diff --git a/commands/package.go b/commands/package.go index 3e3cec02..d9a41c6b 100644 --- a/commands/package.go +++ b/commands/package.go @@ -74,6 +74,7 @@ func newPackagedCmd() *cobra.Command { } c.Flags().StringVar(&opts.ggufPath, "gguf", "", "absolute path to gguf file (required)") + c.Flags().StringVar(&opts.chatTemplatePath, "chat-template", "", "absolute path to chat template file (must be Jinja format)") c.Flags().StringArrayVarP(&opts.licensePaths, "license", "l", nil, "absolute path to a license file") c.Flags().BoolVar(&opts.push, "push", false, "push to registry (if not set, the model is loaded into the Model Runner content store)") c.Flags().Uint64Var(&opts.contextSize, "context-size", 0, "context size in tokens") @@ -81,11 +82,12 @@ func newPackagedCmd() *cobra.Command { } type packageOptions struct { - ggufPath string - licensePaths []string - push bool - contextSize uint64 - tag string + chatTemplatePath string + contextSize uint64 + ggufPath string + licensePaths []string + push bool + tag string } func packageModel(cmd *cobra.Command, opts packageOptions) error { @@ -126,6 +128,13 @@ func packageModel(cmd *cobra.Command, opts packageOptions) error { } } + if opts.chatTemplatePath != "" { + cmd.PrintErrf("Adding chat template file from %q\n", opts.chatTemplatePath) + if pkg, err = pkg.WithChatTemplateFile(opts.chatTemplatePath); err != nil { + return fmt.Errorf("add chat template file from path %q: %w", opts.chatTemplatePath, err) + } + } + if opts.push { cmd.PrintErrln("Pushing model to registry...") } else { diff --git a/docs/reference/docker_model_package.yaml b/docs/reference/docker_model_package.yaml index 36fbc388..712a9480 100644 --- a/docs/reference/docker_model_package.yaml +++ b/docs/reference/docker_model_package.yaml @@ -8,6 +8,15 @@ usage: docker model package --gguf [--license ...] [--context-size pname: docker model plink: docker_model.yaml options: + - option: chat-template + value_type: string + description: absolute path to chat template file (must be Jinja format) + deprecated: false + hidden: false + experimental: false + experimentalcli: false + kubernetes: false + swarm: false - option: context-size value_type: uint64 default_value: "0" diff --git a/docs/reference/model_package.md b/docs/reference/model_package.md index 62dc7d89..a0448f79 100644 --- a/docs/reference/model_package.md +++ b/docs/reference/model_package.md @@ -8,6 +8,7 @@ When packaging a sharded model --gguf should point to the first shard. 
All shard | Name | Type | Default | Description | |:------------------|:--------------|:--------|:---------------------------------------------------------------------------------------| +| `--chat-template` | `string` | | absolute path to chat template file (must be Jinja format) | | `--context-size` | `uint64` | `0` | context size in tokens | | `--gguf` | `string` | | absolute path to gguf file (required) | | `-l`, `--license` | `stringArray` | | absolute path to a license file | diff --git a/go.mod b/go.mod index 6ac2df9c..0719d66e 100644 --- a/go.mod +++ b/go.mod @@ -11,7 +11,7 @@ require ( github.com/docker/docker v28.2.2+incompatible github.com/docker/go-connections v0.5.0 github.com/docker/go-units v0.5.0 - github.com/docker/model-distribution v0.0.0-20250905083217-3f098b3d8058 + github.com/docker/model-distribution v0.0.0-20250918153037-7d9fc7b72b57 github.com/docker/model-runner v0.0.0-20250911130340-38bb0171c947 github.com/fatih/color v1.15.0 github.com/google/go-containerregistry v0.20.6 @@ -55,7 +55,7 @@ require ( github.com/gogo/protobuf v1.3.2 // indirect github.com/google/uuid v1.6.0 // indirect github.com/gorilla/mux v1.8.1 // indirect - github.com/gpustack/gguf-parser-go v0.14.1 // indirect + github.com/gpustack/gguf-parser-go v0.22.1 // indirect github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.1 // indirect github.com/henvic/httpretty v0.1.4 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect @@ -84,7 +84,6 @@ require ( github.com/prometheus/common v0.65.0 // indirect github.com/prometheus/procfs v0.15.1 // indirect github.com/rivo/uniseg v0.4.7 // indirect - github.com/rs/dnscache v0.0.0-20230804202142-fc85eb664529 // indirect github.com/russross/blackfriday/v2 v2.1.0 // indirect github.com/sirupsen/logrus v1.9.3 // indirect github.com/smallnest/ringbuffer v0.0.0-20241116012123-461381446e3d // indirect diff --git a/go.sum b/go.sum index 98238031..7f8150d5 100644 --- a/go.sum +++ b/go.sum @@ -80,8 +80,8 @@ github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDD github.com/docker/go-winjob v0.0.0-20250829235554-57b487ebcbc5 h1:dxSFEb0EEmvceIawSFNDMrvKakRz2t+2WYpY3dFAT04= github.com/docker/go-winjob v0.0.0-20250829235554-57b487ebcbc5/go.mod h1:ICOGmIXdwhfid7rQP+tLvDJqVg0lHdEk3pI5nsapTtg= github.com/docker/libtrust v0.0.0-20160708172513-aabc10ec26b7/go.mod h1:cyGadeNEkKy96OOhEzfZl+yxihPEzKnqJwvfuSUqbZE= -github.com/docker/model-distribution v0.0.0-20250905083217-3f098b3d8058 h1:whffgQ1pmiMFVrxRhJKA9yyCJXvmVX6iiohU9ezKCx0= -github.com/docker/model-distribution v0.0.0-20250905083217-3f098b3d8058/go.mod h1:dThpO9JoG5Px3i+rTluAeZcqLGw8C0qepuEL4gL2o/c= +github.com/docker/model-distribution v0.0.0-20250918153037-7d9fc7b72b57 h1:WHiPO9UmO5v97T3ksQUA2SbYVkTdUCSFobznegL97kk= +github.com/docker/model-distribution v0.0.0-20250918153037-7d9fc7b72b57/go.mod h1:bV1RH2e79nTwOW38GoMU9UO8gpZVLH9+cZeEeR4wSeE= github.com/docker/model-runner v0.0.0-20250911130340-38bb0171c947 h1:6Dz1SFZONEd8tlKetn2Gu6v5HDJI/YtUFwkqHGwrsV0= github.com/docker/model-runner v0.0.0-20250911130340-38bb0171c947/go.mod h1:cl7panafjkSHllYCCGYAzty2aUvbwk55Gi35v06XL80= github.com/dvsekhvalnov/jose2go v0.0.0-20170216131308-f21a8cedbbae/go.mod h1:7BvyPhdbLxMXIYTFPLsyJRFMsKmOZnQmzh6Gb+uquuM= @@ -130,8 +130,8 @@ github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+ github.com/gorilla/mux v1.7.0/go.mod h1:1lud6UwP+6orDFRuTfBEV8e9/aOM/c4fVVCaMa2zaAs= github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY= github.com/gorilla/mux 
v1.8.1/go.mod h1:AKf9I4AEqPTmMytcMc0KkNouC66V3BtZ4qD5fmWSiMQ= -github.com/gpustack/gguf-parser-go v0.14.1 h1:tmz2eTnSEFfE52V10FESqo9oAUquZ6JKQFntWC/wrEg= -github.com/gpustack/gguf-parser-go v0.14.1/go.mod h1:GvHh1Kvvq5ojCOsJ5UpwiJJmIjFw3Qk5cW7R+CZ3IJo= +github.com/gpustack/gguf-parser-go v0.22.1 h1:FRnEDWqT0Rcplr/R9ctCRSN2+3DhVsf6dnR5/i9JA4E= +github.com/gpustack/gguf-parser-go v0.22.1/go.mod h1:y4TwTtDqFWTK+xvprOjRUh+dowgU2TKCX37vRKvGiZ0= github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.1 h1:e9Rjr40Z98/clHv5Yg79Is0NtosR5LXRvdr7o/6NwbA= github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.1/go.mod h1:tIxuGz/9mpox++sgp9fJjHO0+q1X9/UOWd798aAm22M= github.com/hailocab/go-hostpool v0.0.0-20160125115350-e80d13ce29ed/go.mod h1:tMWxXQ9wFIaZeTI9F+hmhFiGpFmhOHzyShyFUhRm0H4= @@ -243,8 +243,6 @@ github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ= github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= -github.com/rs/dnscache v0.0.0-20230804202142-fc85eb664529 h1:18kd+8ZUlt/ARXhljq+14TwAoKa61q6dX8jtwOf6DH8= -github.com/rs/dnscache v0.0.0-20230804202142-fc85eb664529/go.mod h1:qe5TWALJ8/a1Lqznoc5BDHpYX/8HU60Hm2AwRmqzxqA= github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/sirupsen/logrus v1.0.6/go.mod h1:pMByvHTf9Beacp5x1UXfOR9xyW/9antXMhjMPG0dEzc= diff --git a/vendor/github.com/docker/model-distribution/builder/builder.go b/vendor/github.com/docker/model-distribution/builder/builder.go index 871e462a..659d1e8e 100644 --- a/vendor/github.com/docker/model-distribution/builder/builder.go +++ b/vendor/github.com/docker/model-distribution/builder/builder.go @@ -55,6 +55,17 @@ func (b *Builder) WithMultimodalProjector(path string) (*Builder, error) { }, nil } +// WithChatTemplateFile adds a Jinja chat template file to the artifact which takes precedence over template from GGUF. 
+func (b *Builder) WithChatTemplateFile(path string) (*Builder, error) { + templateLayer, err := partial.NewLayer(path, types.MediaTypeChatTemplate) + if err != nil { + return nil, fmt.Errorf("chat template layer from %q: %w", path, err) + } + return &Builder{ + model: mutate.AppendLayers(b.model, templateLayer), + }, nil +} + // Target represents a build target type Target interface { Write(context.Context, types.ModelArtifact, io.Writer) error diff --git a/vendor/github.com/docker/model-distribution/internal/bundle/bundle.go b/vendor/github.com/docker/model-distribution/internal/bundle/bundle.go index a32b8031..5476fc55 100644 --- a/vendor/github.com/docker/model-distribution/internal/bundle/bundle.go +++ b/vendor/github.com/docker/model-distribution/internal/bundle/bundle.go @@ -8,10 +8,11 @@ import ( // Bundle represents a runtime bundle containing a model and runtime config type Bundle struct { - dir string - mmprojPath string - ggufFile string // path to GGUF file (first shard when model is split among files) - runtimeConfig types.Config + dir string + mmprojPath string + ggufFile string // path to GGUF file (first shard when model is split among files) + runtimeConfig types.Config + chatTemplatePath string } // RootDir return the path to the bundle root directory @@ -36,6 +37,14 @@ func (b *Bundle) MMPROJPath() string { return filepath.Join(b.dir, b.mmprojPath) } +// ChatTemplatePath return the path to a Jinja chat template file or "" if none is present. +func (b *Bundle) ChatTemplatePath() string { + if b.chatTemplatePath == "" { + return "" + } + return filepath.Join(b.dir, b.chatTemplatePath) +} + // RuntimeConfig returns config that should be respected by the backend at runtime. func (b *Bundle) RuntimeConfig() types.Config { return b.runtimeConfig diff --git a/vendor/github.com/docker/model-distribution/internal/bundle/parse.go b/vendor/github.com/docker/model-distribution/internal/bundle/parse.go index 016254cb..93912dac 100644 --- a/vendor/github.com/docker/model-distribution/internal/bundle/parse.go +++ b/vendor/github.com/docker/model-distribution/internal/bundle/parse.go @@ -22,15 +22,20 @@ func Parse(rootDir string) (*Bundle, error) { if err != nil { return nil, err } + templatePath, err := findChatTemplateFile(rootDir) + if err != nil { + return nil, err + } cfg, err := parseRuntimeConfig(rootDir) if err != nil { return nil, err } return &Bundle{ - dir: rootDir, - mmprojPath: mmprojPath, - ggufFile: ggufPath, - runtimeConfig: cfg, + dir: rootDir, + mmprojPath: mmprojPath, + ggufFile: ggufPath, + runtimeConfig: cfg, + chatTemplatePath: templatePath, }, nil } @@ -71,3 +76,17 @@ func findMultiModalProjectorFile(rootDir string) (string, error) { } return filepath.Base(mmprojPaths[0]), nil } + +func findChatTemplateFile(rootDir string) (string, error) { + templatePaths, err := filepath.Glob(filepath.Join(rootDir, "[^.]*.jinja")) + if err != nil { + return "", err + } + if len(templatePaths) == 0 { + return "", nil + } + if len(templatePaths) > 1 { + return "", fmt.Errorf("found multiple template files, but only 1 is supported") + } + return filepath.Base(templatePaths[0]), nil +} diff --git a/vendor/github.com/docker/model-distribution/internal/bundle/unpack.go b/vendor/github.com/docker/model-distribution/internal/bundle/unpack.go index 5fe6a23e..f44069e5 100644 --- a/vendor/github.com/docker/model-distribution/internal/bundle/unpack.go +++ b/vendor/github.com/docker/model-distribution/internal/bundle/unpack.go @@ -20,6 +20,9 @@ func Unpack(dir string, model types.Model) 
(*Bundle, error) { if err := unpackMultiModalProjector(bundle, model); err != nil { return nil, fmt.Errorf("add multi-model projector file to runtime bundle: %w", err) } + if err := unpackTemplate(bundle, model); err != nil { + return nil, fmt.Errorf("add chat template file to runtime bundle: %w", err) + } if err := unpackRuntimeConfig(bundle, model); err != nil { return nil, fmt.Errorf("add config.json to runtime bundle: %w", err) } @@ -80,6 +83,18 @@ func unpackMultiModalProjector(bundle *Bundle, mdl types.Model) error { return nil } +func unpackTemplate(bundle *Bundle, mdl types.Model) error { + path, err := mdl.ChatTemplatePath() + if err != nil { + return nil // no such file + } + if err = unpackFile(filepath.Join(bundle.dir, "template.jinja"), path); err != nil { + return err + } + bundle.chatTemplatePath = "template.jinja" + return nil +} + func unpackFile(bundlePath string, srcPath string) error { return os.Link(srcPath, bundlePath) } diff --git a/vendor/github.com/docker/model-distribution/internal/partial/partial.go b/vendor/github.com/docker/model-distribution/internal/partial/partial.go index 7367556c..8d6c3a27 100644 --- a/vendor/github.com/docker/model-distribution/internal/partial/partial.go +++ b/vendor/github.com/docker/model-distribution/internal/partial/partial.go @@ -84,6 +84,21 @@ func MMPROJPath(i WithLayers) (string, error) { return paths[0], err } +func ChatTemplatePath(i WithLayers) (string, error) { + paths, err := layerPathsByMediaType(i, types.MediaTypeChatTemplate) + if err != nil { + return "", fmt.Errorf("get chat template layer paths: %w", err) + } + if len(paths) == 0 { + return "", fmt.Errorf("model does not contain any layer of type %q", types.MediaTypeChatTemplate) + } + if len(paths) > 1 { + return "", fmt.Errorf("found %d files of type %q, expected exactly 1", + len(paths), types.MediaTypeChatTemplate) + } + return paths[0], err +} + // layerPathsByMediaType is a generic helper function that finds a layer by media type and returns its path func layerPathsByMediaType(i WithLayers, mediaType ggcr.MediaType) ([]string, error) { layers, err := i.Layers() diff --git a/vendor/github.com/docker/model-distribution/internal/store/model.go b/vendor/github.com/docker/model-distribution/internal/store/model.go index b35539a6..bd3a4fa0 100644 --- a/vendor/github.com/docker/model-distribution/internal/store/model.go +++ b/vendor/github.com/docker/model-distribution/internal/store/model.go @@ -118,6 +118,10 @@ func (m *Model) MMPROJPath() (string, error) { return mdpartial.MMPROJPath(m) } +func (m *Model) ChatTemplatePath() (string, error) { + return mdpartial.ChatTemplatePath(m) +} + func (m *Model) Tags() []string { return m.tags } diff --git a/vendor/github.com/docker/model-distribution/types/config.go b/vendor/github.com/docker/model-distribution/types/config.go index 8211dd2a..0261a9f9 100644 --- a/vendor/github.com/docker/model-distribution/types/config.go +++ b/vendor/github.com/docker/model-distribution/types/config.go @@ -23,6 +23,9 @@ const ( // MediaTypeMultimodalProjector indicates a Multimodal projector file MediaTypeMultimodalProjector = types.MediaType("application/vnd.docker.ai.mmproj") + // MediaTypeChatTemplate indicates a Jinja chat template + MediaTypeChatTemplate = types.MediaType("application/vnd.docker.ai.chat.template.jinja") + FormatGGUF = Format("gguf") ) diff --git a/vendor/github.com/docker/model-distribution/types/model.go b/vendor/github.com/docker/model-distribution/types/model.go index 62374c02..7f9ba394 100644 --- 
a/vendor/github.com/docker/model-distribution/types/model.go +++ b/vendor/github.com/docker/model-distribution/types/model.go @@ -11,6 +11,7 @@ type Model interface { Config() (Config, error) Tags() []string Descriptor() (Descriptor, error) + ChatTemplatePath() (string, error) } type ModelArtifact interface { @@ -23,6 +24,7 @@ type ModelArtifact interface { type ModelBundle interface { RootDir() string GGUFPath() string + ChatTemplatePath() string MMPROJPath() string RuntimeConfig() Config } diff --git a/vendor/github.com/gpustack/gguf-parser-go/.golangci.yaml b/vendor/github.com/gpustack/gguf-parser-go/.golangci.yaml index 480355ee..f514dad7 100644 --- a/vendor/github.com/gpustack/gguf-parser-go/.golangci.yaml +++ b/vendor/github.com/gpustack/gguf-parser-go/.golangci.yaml @@ -1,3 +1,5 @@ +version: "1" + run: timeout: 10m tests: true @@ -8,7 +10,6 @@ run: output: print-issued-lines: true print-linter-name: true - uniq-by-line: true path-prefix: "" sort-results: true @@ -22,7 +23,7 @@ linters: - errcheck - errname - errorlint - - exportloopref + - copyloopvar - godot - goconst - gocritic @@ -83,6 +84,7 @@ linters-settings: - G101 - G107 - G112 + - G115 - G404 gofumpt: extra-rules: true @@ -119,7 +121,6 @@ linters-settings: unused: field-writes-are-uses: true post-statements-are-reads: true - exported-is-used: true exported-fields-are-used: true parameters-are-used: true local-variables-are-used: true @@ -133,6 +134,7 @@ linters-settings: crypto-hash: true issues: + uniq-by-line: true exclude-files: - "doc.go" - "zz_generated.*.go" diff --git a/vendor/github.com/gpustack/gguf-parser-go/Makefile b/vendor/github.com/gpustack/gguf-parser-go/Makefile index 1eea47d6..2834fb6c 100644 --- a/vendor/github.com/gpustack/gguf-parser-go/Makefile +++ b/vendor/github.com/gpustack/gguf-parser-go/Makefile @@ -34,33 +34,33 @@ generate: lint: @echo "+++ $@ +++" - if [[ "$(LINT_DIRTY)" == "true" ]]; then \ - if [[ -n $$(git status --porcelain) ]]; then \ - echo "Code tree is dirty."; \ - git diff --exit-code; \ - fi; \ - fi - [[ -d "$(SRCDIR)/.sbin" ]] || mkdir -p "$(SRCDIR)/.sbin" [[ -f "$(SRCDIR)/.sbin/goimports-reviser" ]] || \ - curl --retry 3 --retry-all-errors --retry-delay 3 -sSfL "https://github.com/incu6us/goimports-reviser/releases/download/v3.6.5/goimports-reviser_3.6.5_$(GOOS)_$(GOARCH).tar.gz" \ + curl --retry 3 --retry-all-errors --retry-delay 3 -sSfL "https://github.com/incu6us/goimports-reviser/releases/download/v3.8.2/goimports-reviser_3.8.2_$(GOOS)_$(GOARCH).tar.gz" \ | tar -zxvf - --directory "$(SRCDIR)/.sbin" --no-same-owner --exclude ./LICENSE --exclude ./README.md && chmod +x "$(SRCDIR)/.sbin/goimports-reviser" cd $(SRCDIR) && \ go list -f "{{.Dir}}" ./... | xargs -I {} find {} -maxdepth 1 -type f -name '*.go' ! -name 'gen.*' ! -name 'zz_generated.*' \ - | xargs -I {} "$(SRCDIR)/.sbin/goimports-reviser" -use-cache -imports-order=std,general,company,project,blanked,dotted -output=file {} + | xargs -I {} "$(SRCDIR)/.sbin/goimports-reviser" -use-cache -imports-order=std,general,company,project,blanked,dotted -output=file {} 1>/dev/null 2>&1 cd $(SRCDIR)/cmd/gguf-parser && \ go list -f "{{.Dir}}" ./... | xargs -I {} find {} -maxdepth 1 -type f -name '*.go' ! -name 'gen.*' ! 
-name 'zz_generated.*' \ - | xargs -I {} "$(SRCDIR)/.sbin/goimports-reviser" -use-cache -imports-order=std,general,company,project,blanked,dotted -output=file {} + | xargs -I {} "$(SRCDIR)/.sbin/goimports-reviser" -use-cache -imports-order=std,general,company,project,blanked,dotted -output=file {} 1>/dev/null 2>&1 [[ -f "$(SRCDIR)/.sbin/golangci-lint" ]] || \ curl --retry 3 --retry-all-errors --retry-delay 3 -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh \ - | sh -s -- -b "$(SRCDIR)/.sbin" "v1.59.0" + | sh -s -- -b "$(SRCDIR)/.sbin" "v1.63.4" cd $(SRCDIR) && \ "$(SRCDIR)/.sbin/golangci-lint" run --fix ./... cd $(SRCDIR)/cmd/gguf-parser && \ "$(SRCDIR)/.sbin/golangci-lint" run --fix ./... + if [[ "$(LINT_DIRTY)" == "true" ]]; then \ + if [[ -n $$(git status --porcelain) ]]; then \ + echo "Code tree is dirty."; \ + git diff --exit-code; \ + fi; \ + fi + @echo "--- $@ ---" test: @@ -99,7 +99,7 @@ gguf-parser: if [[ $$os == "darwin" ]]; then \ [[ -d "$(SRCDIR)/.sbin" ]] || mkdir -p "$(SRCDIR)/.sbin"; \ [[ -f "$(SRCDIR)/.sbin/lipo" ]] || \ - GOBIN="$(SRCDIR)/.sbin" go install github.com/konoui/lipo@v0.9.1; \ + GOBIN="$(SRCDIR)/.sbin" go install github.com/konoui/lipo@v0.9.2; \ "$(SRCDIR)/.sbin/lipo" -create -output $(SRCDIR)/.dist/gguf-parser-darwin-universal $(SRCDIR)/.dist/gguf-parser-darwin-amd64 $(SRCDIR)/.dist/gguf-parser-darwin-arm64; \ fi;\ if [[ $$os == "$(GOOS)" ]] && [[ $$arch == "$(GOARCH)" ]]; then \ @@ -126,7 +126,7 @@ package: build fi; \ if [[ "$(PACKAGE_PUBLISH)" == "true" ]]; then \ if [[ -z $$(docker buildx inspect --builder "gguf-parser") ]]; then \ - docker run --rm --privileged tonistiigi/binfmt:qemu-v7.0.0 --install $$platform; \ + docker run --rm --privileged tonistiigi/binfmt:qemu-v9.2.2 --install $$platform; \ docker buildx create --name "gguf-parser" --driver "docker-container" --buildkitd-flags "--allow-insecure-entitlement security.insecure --allow-insecure-entitlement network.host" --bootstrap; \ fi; \ docker buildx build --progress=plain --platform=$$platform --builder="gguf-parser" --output="type=image,name=$$image,push=true" "$(SRCDIR)"; \ @@ -137,4 +137,4 @@ package: build @echo "--- $@ ---" -ci: deps generate test lint build +ci: deps generate lint test build diff --git a/vendor/github.com/gpustack/gguf-parser-go/README.md b/vendor/github.com/gpustack/gguf-parser-go/README.md index 96c6804c..52d6f5f2 100644 --- a/vendor/github.com/gpustack/gguf-parser-go/README.md +++ b/vendor/github.com/gpustack/gguf-parser-go/README.md @@ -56,6 +56,16 @@ download it. ## Notes +- **Since v0.20.0**, GGUF Parser supports leveraging `--override-tensor` to indicate how to place the model tensors. +- **Since v0.19.0**, GGUF Parser supports estimating Audio projector model file, like Ultravox series, Qwen2 Audio + series, etc. +- **Since v0.18.0**, GGUF Parser supports estimating SWA-supported(sliding window attention) model file, like LLaMA 4 + series, Gemma2/3 series, etc. +- **Since v0.17.0**, GGUF Parser align the `QUANTIZATION`( + aka. [`general.file_type`](https://github.com/ggml-org/ggml/blob/master/docs/gguf.md#general-metadata)) + to [HuggingFace processing](https://github.com/huggingface/huggingface.js/blob/2475d6d316135c0a4fceff6b3fe2aed0dde36ac1/packages/gguf/src/types.ts#L11-L48), + but there are still many model files whose naming does not fully follow `general.file_type`. +- **Since v0.16.0**, GGUF Parser supports estimating MLA-supported model file, like DeepSeek series. 
- **Since v0.14.0 (BREAKING CHANGE)**, GGUF Parser parses `*.feed_forward_length` metadata as `[]uint64`, which means the architecture `feedForwardLength` is a list of integers. - **Since v0.13.0 (BREAKING CHANGE)**, GGUF Parser can parse files @@ -93,21 +103,21 @@ Install from [releases](https://github.com/gpustack/gguf-parser-go/releases). ```shell $ gguf-parser --path ~/.cache/lm-studio/models/unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF/DeepSeek-R1-Distill-Qwen-7B-Q4_K_M.gguf -+-------------------------------------------------------------------------------------------------------------+ -| METADATA | -+-------+-------------------------+-------+----------------+---------------+----------+------------+----------+ -| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | -+-------+-------------------------+-------+----------------+---------------+----------+------------+----------+ -| model | DeepSeek R1 Distill ... | qwen2 | IQ2_XXS/Q4_K_M | true | 4.36 GiB | 7.62 B | 4.91 bpw | -+-------+-------------------------+-------+----------------+---------------+----------+------------+----------+ - -+---------------------------------------------------------------------------------------------------------------------------------------------------+ -| ARCHITECTURE | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ -| MAX CONTEXT LEN | EMBEDDING LEN | EMBEDDING GQA | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ -| 131072 | 3584 | 7 | true | 28 | 28 | 18944 | 0 | 152064 | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ ++-----------------------------------------------------------------------------------------------------------+ +| METADATA | ++-------+-------------------------+-------+--------------+---------------+----------+------------+----------+ +| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | ++-------+-------------------------+-------+--------------+---------------+----------+------------+----------+ +| model | DeepSeek R1 Distill ... 
| qwen2 | Q4_K_M | true | 4.36 GiB | 7.62 B | 4.91 bpw | ++-------+-------------------------+-------+--------------+---------------+----------+------------+----------+ + ++-----------------------------------------------------------------------------------------------------------------------------------+ +| ARCHITECTURE | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +| MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +| 131072 | 3584 | true | 28 | 28 | 18944 | 0 | 152064 | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +-------------------------------------------------------------------------------------------------------------------------------------------------------+ | TOKENIZER | @@ -117,15 +127,15 @@ $ gguf-parser --path ~/.cache/lm-studio/models/unsloth/DeepSeek-R1-Distill-Qwen- | gpt2 | 2.47 MiB | 152064 | N/A | 151646 | 151643 | N/A | N/A | N/A | N/A | 151654 | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ -+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| ESTIMATE | -+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+----------------------------------------------+-------------------------------------+ -| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | -| | | | | | | | | | +--------------------+------------+------------+----------------+--------+-----------+ -| | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | -+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+--------+-----------+ -| qwen2 | 131072 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 29 (28 + 1) | Yes | 1 + 0 + 0 | 654.26 MiB | 804.26 MiB | 28 + 1 | 7 GiB | 18.59 GiB | -+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+--------+-----------+ ++-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| ESTIMATE | ++-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+----------------------------------------------+---------------------------------------+ +| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | 
RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | +| | | | | | | | | | +--------------------+------------+------------+----------------+----------+-----------+ +| | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | ++-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+ +| qwen2 | 131072 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 29 (28 + 1) | Yes | 1 + 0 + 0 | 677.44 MiB | 827.44 MiB | 28 + 1 | 7.30 GiB | 18.89 GiB | ++-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+ $ # Retrieve the model's metadata via split file, $ # which needs all split files has been downloaded. @@ -138,13 +148,13 @@ $ gguf-parser --path ~/.cache/lm-studio/models/Qwen/Qwen2.5-7B-Instruct-GGUF/qw | model | qwen2.5-7b-instruct | qwen2 | Q8_0 | true | 7.54 GiB | 7.62 B | 8.50 bpw | +-------+---------------------+-------+--------------+---------------+----------+------------+----------+ -+---------------------------------------------------------------------------------------------------------------------------------------------------+ -| ARCHITECTURE | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ -| MAX CONTEXT LEN | EMBEDDING LEN | EMBEDDING GQA | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ -| 131072 | 3584 | 7 | true | 28 | 28 | 18944 | 0 | 152064 | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ ++-----------------------------------------------------------------------------------------------------------------------------------+ +| ARCHITECTURE | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +| MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +| 131072 | 3584 | true | 28 | 28 | 18944 | 0 | 152064 | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +-------------------------------------------------------------------------------------------------------------------------------------------------------+ | TOKENIZER | @@ -154,29 +164,36 @@ $ gguf-parser --path ~/.cache/lm-studio/models/Qwen/Qwen2.5-7B-Instruct-GGUF/qw | gpt2 | 2.47 MiB | 152064 | N/A | 151643 | 151645 | N/A | N/A | N/A | N/A | 151643 | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ 
-+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - ++-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| ESTIMATE | ++-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+----------------------------------------------+---------------------------------------+ +| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | +| | | | | | | | | | +--------------------+------------+------------+----------------+----------+-----------+ +| | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | ++-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+ +| qwen2 | 131072 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 29 (28 + 1) | Yes | 1 + 0 + 0 | 677.44 MiB | 827.44 MiB | 28 + 1 | 7.30 GiB | 21.82 GiB | ++-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+ ``` #### Parse Remote File ```shell $ gguf-parser --url="https://huggingface.co/bartowski/Qwen2.5-72B-Instruct-GGUF/resolve/main/Qwen2.5-72B-Instruct-Q4_K_M.gguf" -+-----------------------------------------------------------------------------------------------------------+ -| METADATA | -+-------+----------------------+-------+----------------+---------------+-----------+------------+----------+ -| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | -+-------+----------------------+-------+----------------+---------------+-----------+------------+----------+ -| model | Qwen2.5 72B Instruct | qwen2 | IQ2_XXS/Q4_K_M | true | 44.15 GiB | 72.71 B | 5.22 bpw | -+-------+----------------------+-------+----------------+---------------+-----------+------------+----------+ - -+---------------------------------------------------------------------------------------------------------------------------------------------------+ -| ARCHITECTURE | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ -| MAX CONTEXT LEN | EMBEDDING LEN | EMBEDDING GQA | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ -| 32768 | 8192 | 8 | true | 64 | 80 | 29568 | 0 | 152064 | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ ++---------------------------------------------------------------------------------------------------------+ +| METADATA | 
++-------+----------------------+-------+--------------+---------------+-----------+------------+----------+ +| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | ++-------+----------------------+-------+--------------+---------------+-----------+------------+----------+ +| model | Qwen2.5 72B Instruct | qwen2 | Q4_K_M | true | 44.15 GiB | 72.71 B | 5.22 bpw | ++-------+----------------------+-------+--------------+---------------+-----------+------------+----------+ + ++-----------------------------------------------------------------------------------------------------------------------------------+ +| ARCHITECTURE | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +| MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +| 32768 | 8192 | true | 64 | 80 | 29568 | 0 | 152064 | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +-------------------------------------------------------------------------------------------------------------------------------------------------------+ | TOKENIZER | @@ -186,15 +203,15 @@ $ gguf-parser --url="https://huggingface.co/bartowski/Qwen2.5-72B-Instruct-GGUF/ | gpt2 | 2.47 MiB | 152064 | N/A | 151643 | 151645 | N/A | N/A | N/A | N/A | 151643 | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ -+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| ESTIMATE | -+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+----------------------------------------------+-------------------------------------+ -| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | -| | | | | | | | | | +--------------------+------------+------------+----------------+--------+-----------+ -| | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | -+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+--------+-----------+ -| qwen2 | 32768 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 81 (80 + 1) | Yes | 1 + 0 + 0 | 403.39 MiB | 553.39 MiB | 80 + 1 | 10 GiB | 57.87 GiB | -+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+--------+-----------+ ++--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| ESTIMATE | 
++-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+----------------------------------------------+----------------------------------------+ +| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | +| | | | | | | | | | +--------------------+------------+------------+----------------+-----------+-----------+ +| | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | ++-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+-----------+-----------+ +| qwen2 | 32768 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 81 (80 + 1) | Yes | 1 + 0 + 0 | 426.57 MiB | 576.57 MiB | 80 + 1 | 10.31 GiB | 58.18 GiB | ++-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+-----------+-----------+ $ # Retrieve the model's metadata via split file @@ -204,16 +221,16 @@ $ gguf-parser --url="https://huggingface.co/unsloth/DeepSeek-R1-GGUF/resolve/mai +-------+------------------+-----------+--------------+---------------+------------+------------+----------+ | TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | +-------+------------------+-----------+--------------+---------------+------------+------------+----------+ -| model | DeepSeek R1 BF16 | deepseek2 | BF16 | true | 130.60 GiB | 671.03 B | 1.67 bpw | +| model | DeepSeek R1 BF16 | deepseek2 | IQ1_S | true | 130.60 GiB | 671.03 B | 1.67 bpw | +-------+------------------+-----------+--------------+---------------+------------+------------+----------+ -+---------------------------------------------------------------------------------------------------------------------------------------------------+ -| ARCHITECTURE | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ -| MAX CONTEXT LEN | EMBEDDING LEN | EMBEDDING GQA | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ -| 163840 | 7168 | 1 | true | N/A | 61 | 18432 | 256 | 129280 | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ ++-----------------------------------------------------------------------------------------------------------------------------------+ +| ARCHITECTURE | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +| MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +| 163840 | 7168 | true | N/A | 61 | 18432 | 256 | 129280 | 
++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +-------------------------------------------------------------------------------------------------------------------------------------------------------+ | TOKENIZER | @@ -230,9 +247,8 @@ $ gguf-parser --url="https://huggingface.co/unsloth/DeepSeek-R1-GGUF/resolve/mai | | | | | | | | | | +--------------------+-----------+-----------+----------------+------------+--------+ | | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | +-----------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+-----------+-----------+----------------+------------+--------+ -| deepseek2 | 163840 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 62 (61 + 1) | Yes | 1 + 0 + 0 | 13.01 GiB | 13.16 GiB | 61 + 1 | 762.50 GiB | 1 TB | +| deepseek2 | 163840 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 62 (61 + 1) | Yes | 1 + 0 + 0 | 13.03 GiB | 13.18 GiB | 61 + 1 | 762.76 GiB | 1 TB | +-----------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+-----------+-----------+----------------+------------+--------+ - ``` #### Parse From HuggingFace @@ -251,13 +267,13 @@ $ gguf-parser --hf-repo="bartowski/Qwen2-VL-2B-Instruct-GGUF" --hf-file="Qwen2-V | model | Qwen2 VL 2B Instruct | qwen2vl | F16 | true | 2.88 GiB | 1.54 B | 16.00 bpw | +-------+----------------------+---------+--------------+---------------+----------+------------+-----------+ -+---------------------------------------------------------------------------------------------------------------------------------------------------+ -| ARCHITECTURE | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ -| MAX CONTEXT LEN | EMBEDDING LEN | EMBEDDING GQA | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ -| 32768 | 1536 | 6 | true | 12 | 28 | 8960 | 0 | 151936 | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ ++-----------------------------------------------------------------------------------------------------------------------------------+ +| ARCHITECTURE | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +| MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +| 32768 | 1536 | true | 12 | 28 | 8960 | 0 | 151936 | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +-------------------------------------------------------------------------------------------------------------------------------------------------------+ | TOKENIZER | @@ -274,7 +290,7 @@ $ gguf-parser 
--hf-repo="bartowski/Qwen2-VL-2B-Instruct-GGUF" --hf-file="Qwen2-V | | | | | | | | | | +--------------------+------------+------------+----------------+----------+-----------+ | | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | +---------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+ -| qwen2vl | 32768 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 29 (28 + 1) | Yes | 1 + 0 + 0 | 213.55 MiB | 363.55 MiB | 28 + 1 | 3.35 GiB | 12.60 GiB | +| qwen2vl | 32768 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 29 (28 + 1) | Yes | 1 + 0 + 0 | 236.87 MiB | 386.87 MiB | 28 + 1 | 3.65 GiB | 12.86 GiB | +---------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+ $ # Retrieve the model's metadata via split file @@ -288,13 +304,13 @@ $ gguf-parser --hf-repo="bartowski/openbuddy-llama3.3-70b-v24.1-131k-GGUF" --hf- | model | Openbuddy Llama3.3 7... | llama | Q4_0 | true | 37.35 GiB | 70.55 B | 4.55 bpw | +-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+ -+---------------------------------------------------------------------------------------------------------------------------------------------------+ -| ARCHITECTURE | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ -| MAX CONTEXT LEN | EMBEDDING LEN | EMBEDDING GQA | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ -| 131072 | 8192 | 8 | true | 64 | 80 | 28672 | 0 | 128256 | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ ++-----------------------------------------------------------------------------------------------------------------------------------+ +| ARCHITECTURE | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +| MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +| 131072 | 8192 | true | 64 | 80 | 28672 | 0 | 128256 | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +-------------------------------------------------------------------------------------------------------------------------------------------------------+ | TOKENIZER | @@ -304,16 +320,15 @@ $ gguf-parser --hf-repo="bartowski/openbuddy-llama3.3-70b-v24.1-131k-GGUF" --hf- | gpt2 | 2 MiB | 128256 | N/A | 128000 | 128048 | N/A | N/A | N/A | N/A | 128044 | 
+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ -+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| ESTIMATE | -+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+-----------------------------------------+-------------------------------------+ -| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | -| | | | | | | | | | +--------------------+---------+----------+----------------+--------+-----------+ -| | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | -+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+---------+----------+----------------+--------+-----------+ -| llama | 131072 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 81 (80 + 1) | Yes | 1 + 0 + 0 | 1.04 GB | 1.11 GiB | 80 + 1 | 40 GiB | 93.36 GiB | -+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+---------+----------+----------------+--------+-----------+ - ++---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| ESTIMATE | ++-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+-----------------------------------------+----------------------------------------+ +| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | +| | | | | | | | | | +--------------------+---------+----------+----------------+-----------+-----------+ +| | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | ++-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+---------+----------+----------------+-----------+-----------+ +| llama | 131072 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 81 (80 + 1) | Yes | 1 + 0 + 0 | 1.06 GB | 1.13 GiB | 80 + 1 | 40.26 GiB | 93.62 GiB | ++-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+---------+----------+----------------+-----------+-----------+ ``` #### Parse From ModelScope @@ -332,13 +347,13 @@ $ gguf-parser --ms-repo="unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF" --ms-file="De | model | DeepSeek R1 Distill ... 
| qwen2 | F16 | true | 14.19 GiB | 7.62 B | 16.00 bpw | +-------+-------------------------+-------+--------------+---------------+-----------+------------+-----------+ -+---------------------------------------------------------------------------------------------------------------------------------------------------+ -| ARCHITECTURE | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ -| MAX CONTEXT LEN | EMBEDDING LEN | EMBEDDING GQA | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ -| 131072 | 3584 | 7 | true | 28 | 28 | 18944 | 0 | 152064 | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ ++-----------------------------------------------------------------------------------------------------------------------------------+ +| ARCHITECTURE | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +| MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +| 131072 | 3584 | true | 28 | 28 | 18944 | 0 | 152064 | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +-------------------------------------------------------------------------------------------------------------------------------------------------------+ | TOKENIZER | @@ -348,16 +363,15 @@ $ gguf-parser --ms-repo="unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF" --ms-file="De | gpt2 | 2.47 MiB | 152064 | N/A | 151646 | 151643 | N/A | N/A | N/A | N/A | 151654 | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ -+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| ESTIMATE | -+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+----------------------------------------------+-------------------------------------+ -| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | -| | | | | | | | | | +--------------------+------------+------------+----------------+--------+-----------+ -| | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | -+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+--------+-----------+ -| qwen2 | 131072 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 29 (28 + 1) | Yes | 1 + 0 + 0 | 654.26 MiB | 804.26 MiB | 28 + 1 | 7 GiB | 27.69 GiB | 
-+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+--------+-----------+ - ++-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| ESTIMATE | ++-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+----------------------------------------------+---------------------------------------+ +| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | +| | | | | | | | | | +--------------------+------------+------------+----------------+----------+-----------+ +| | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | ++-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+ +| qwen2 | 131072 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 29 (28 + 1) | Yes | 1 + 0 + 0 | 677.44 MiB | 827.44 MiB | 28 + 1 | 7.30 GiB | 27.99 GiB | ++-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+ ``` #### Parse From Ollama Library @@ -368,21 +382,21 @@ $ gguf-parser --ms-repo="unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF" --ms-file="De ```shell $ gguf-parser --ol-model="llama3.3" -+--------------------------------------------------------------------------------------------------------------+ -| METADATA | -+-------+-------------------------+-------+----------------+---------------+-----------+------------+----------+ -| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | -+-------+-------------------------+-------+----------------+---------------+-----------+------------+----------+ -| model | Llama 3.1 70B Instru... 
| llama | IQ2_XXS/Q4_K_M | true | 39.59 GiB | 70.55 B | 4.82 bpw | -+-------+-------------------------+-------+----------------+---------------+-----------+------------+----------+ - -+---------------------------------------------------------------------------------------------------------------------------------------------------+ -| ARCHITECTURE | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ -| MAX CONTEXT LEN | EMBEDDING LEN | EMBEDDING GQA | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ -| 131072 | 8192 | 8 | true | 64 | 80 | 28672 | 0 | 128256 | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ ++------------------------------------------------------------------------------------------------------------+ +| METADATA | ++-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+ +| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | ++-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+ +| model | Llama 3.1 70B Instru... | llama | Q4_K_M | true | 39.59 GiB | 70.55 B | 4.82 bpw | ++-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+ + ++-----------------------------------------------------------------------------------------------------------------------------------+ +| ARCHITECTURE | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +| MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +| 131072 | 8192 | true | 64 | 80 | 28672 | 0 | 128256 | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +-------------------------------------------------------------------------------------------------------------------------------------------------------+ | TOKENIZER | @@ -392,34 +406,34 @@ $ gguf-parser --ol-model="llama3.3" | gpt2 | 2 MiB | 128256 | N/A | 128000 | 128009 | N/A | N/A | N/A | N/A | N/A | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ -+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| ESTIMATE | -+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+-----------------------------------------+-------------------------------------+ -| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | -| | | | | | | | | | 
+--------------------+---------+----------+----------------+--------+-----------+ -| | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | -+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+---------+----------+----------------+--------+-----------+ -| llama | 131072 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 81 (80 + 1) | Yes | 1 + 0 + 0 | 1.04 GB | 1.11 GiB | 80 + 1 | 40 GiB | 95.60 GiB | -+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+---------+----------+----------------+--------+-----------+ ++---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| ESTIMATE | ++-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+-----------------------------------------+----------------------------------------+ +| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | +| | | | | | | | | | +--------------------+---------+----------+----------------+-----------+-----------+ +| | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | ++-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+---------+----------+----------------+-----------+-----------+ +| llama | 131072 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 81 (80 + 1) | Yes | 1 + 0 + 0 | 1.06 GB | 1.13 GiB | 80 + 1 | 40.26 GiB | 95.86 GiB | ++-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+---------+----------+----------------+-----------+-----------+ $ # Ollama Model includes the preset params and other artifacts, like multimodal projectors or LoRA adapters, $ # you can get the usage of Ollama running by using `--ol-usage` option. -$ +--------------------------------------------------------------------------------------------------------------+ -| METADATA | -+-------+-------------------------+-------+----------------+---------------+-----------+------------+----------+ -| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | -+-------+-------------------------+-------+----------------+---------------+-----------+------------+----------+ -| model | Llama 3.1 70B Instru... 
| llama | IQ2_XXS/Q4_K_M | true | 39.59 GiB | 70.55 B | 4.82 bpw | -+-------+-------------------------+-------+----------------+---------------+-----------+------------+----------+ - -+---------------------------------------------------------------------------------------------------------------------------------------------------+ -| ARCHITECTURE | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ -| MAX CONTEXT LEN | EMBEDDING LEN | EMBEDDING GQA | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ -| 131072 | 8192 | 8 | true | 64 | 80 | 28672 | 0 | 128256 | -+-----------------+---------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ ++------------------------------------------------------------------------------------------------------------+ +| METADATA | ++-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+ +| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | ++-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+ +| model | Llama 3.1 70B Instru... | llama | Q4_K_M | true | 39.59 GiB | 70.55 B | 4.82 bpw | ++-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+ + ++-----------------------------------------------------------------------------------------------------------------------------------+ +| ARCHITECTURE | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +| MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +| 131072 | 8192 | true | 64 | 80 | 28672 | 0 | 128256 | ++-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +-------------------------------------------------------------------------------------------------------------------------------------------------------+ | TOKENIZER | @@ -436,9 +450,8 @@ $ +----------------------------------------------------------------------------- | | | | | | | | | | +--------------------+------------+------------+----------------+------------+-----------+ | | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | +-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+------------+-----------+ -| llama | 2048 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 81 (80 + 1) | Yes | 1 + 0 + 0 | 232.08 MiB | 382.08 MiB | 80 + 1 | 640.52 MiB | 40.23 GiB | +| llama | 2048 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 81 (80 + 1) | Yes | 1 + 0 + 0 | 255.27 MiB | 405.27 MiB | 80 + 1 | 906.50 MiB | 40.49 GiB | 
+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+------------+-----------+ - ``` #### Others @@ -456,13 +469,13 @@ $ gguf-parser --hf-repo="gpustack/FLUX.1-dev-GGUF" --hf-file="FLUX.1-dev-FP16.gg | model | N/A | diffusion | F16 | true | 31.79 GiB | 17 B | 16.06 bpw | +-------+------+-----------+--------------+---------------+-----------+------------+-----------+ -+-------------------------------------------------------------------------------------+ -| ARCHITECTURE | -+----------------+-------------------------------------------------+------------------+ -| DIFFUSION ARCH | CONDITIONERS | AUTOENCODER | -+----------------+-------------------------------------------------+------------------+ -| FLUX.1 | OpenAI CLIP ViT-L/14 (F16), Google T5-xxl (F16) | FLUX.1 VAE (F16) | -+----------------+-------------------------------------------------+------------------+ ++----------------------------------------------------------------------------------------------------------+ +| ARCHITECTURE | ++----------------+---------------------------------------------------------------+-------------------------+ +| DIFFUSION ARCH | CONDITIONERS | AUTOENCODER | ++----------------+---------------------------------------------------------------+-------------------------+ +| FLUX.1 | OpenAI CLIP ViT-L/14 (MOSTLY_F16), Google T5-xxl (MOSTLY_F16) | FLUX.1 VAE (MOSTLY_F16) | ++----------------+---------------------------------------------------------------+-------------------------+ +---------------------------------------------------------------------------------------------------------------------------+ | ESTIMATE | @@ -471,7 +484,7 @@ $ gguf-parser --hf-repo="gpustack/FLUX.1-dev-GGUF" --hf-file="FLUX.1-dev-FP16.gg | | | | | +------------+------------+-----------+-----------+ | | | | | | UMA | NONUMA | UMA | NONUMA | +--------+-----------------+-------------+---------------+----------------+------------+------------+-----------+-----------+ -| flux_1 | Disabled | Unsupported | Supported | Yes | 333.45 MiB | 483.45 MiB | 31.89 GiB | 41.15 GiB | +| flux_1 | Disabled | Unsupported | Supported | Yes | 343.89 MiB | 493.89 MiB | 31.89 GiB | 41.15 GiB | +--------+-----------------+-------------+---------------+----------------+------------+------------+-----------+-----------+ $ # Parse FLUX.1-dev Model without offload Conditioner and Autoencoder @@ -484,13 +497,13 @@ $ gguf-parser --hf-repo="gpustack/FLUX.1-dev-GGUF" --hf-file="FLUX.1-dev-FP16.gg | model | N/A | diffusion | F16 | true | 31.79 GiB | 17 B | 16.06 bpw | +-------+------+-----------+--------------+---------------+-----------+------------+-----------+ -+-------------------------------------------------------------------------------------+ -| ARCHITECTURE | -+----------------+-------------------------------------------------+------------------+ -| DIFFUSION ARCH | CONDITIONERS | AUTOENCODER | -+----------------+-------------------------------------------------+------------------+ -| FLUX.1 | OpenAI CLIP ViT-L/14 (F16), Google T5-xxl (F16) | FLUX.1 VAE (F16) | -+----------------+-------------------------------------------------+------------------+ ++----------------------------------------------------------------------------------------------------------+ +| ARCHITECTURE | ++----------------+---------------------------------------------------------------+-------------------------+ +| DIFFUSION 
ARCH | CONDITIONERS | AUTOENCODER | ++----------------+---------------------------------------------------------------+-------------------------+ +| FLUX.1 | OpenAI CLIP ViT-L/14 (MOSTLY_F16), Google T5-xxl (MOSTLY_F16) | FLUX.1 VAE (MOSTLY_F16) | ++----------------+---------------------------------------------------------------+-------------------------+ +-------------------------------------------------------------------------------------------------------------------------+ | ESTIMATE | @@ -499,7 +512,7 @@ $ gguf-parser --hf-repo="gpustack/FLUX.1-dev-GGUF" --hf-file="FLUX.1-dev-FP16.gg | | | | | +-----------+-----------+-----------+-----------+ | | | | | | UMA | NONUMA | UMA | NONUMA | +--------+-----------------+-------------+---------------+----------------+-----------+-----------+-----------+-----------+ -| flux_1 | Disabled | Unsupported | Supported | Yes | 16.43 GiB | 16.58 GiB | 22.29 GiB | 25.05 GiB | +| flux_1 | Disabled | Unsupported | Supported | Yes | 16.44 GiB | 16.59 GiB | 22.29 GiB | 25.05 GiB | +--------+-----------------+-------------+---------------+----------------+-----------+-----------+-----------+-----------+ $ # Parse FLUX.1-dev Model with Autoencoder tiling @@ -512,13 +525,13 @@ $ gguf-parser --hf-repo="gpustack/FLUX.1-dev-GGUF" --hf-file="FLUX.1-dev-FP16.gg | model | N/A | diffusion | F16 | true | 31.79 GiB | 17 B | 16.06 bpw | +-------+------+-----------+--------------+---------------+-----------+------------+-----------+ -+-------------------------------------------------------------------------------------+ -| ARCHITECTURE | -+----------------+-------------------------------------------------+------------------+ -| DIFFUSION ARCH | CONDITIONERS | AUTOENCODER | -+----------------+-------------------------------------------------+------------------+ -| FLUX.1 | OpenAI CLIP ViT-L/14 (F16), Google T5-xxl (F16) | FLUX.1 VAE (F16) | -+----------------+-------------------------------------------------+------------------+ ++----------------------------------------------------------------------------------------------------------+ +| ARCHITECTURE | ++----------------+---------------------------------------------------------------+-------------------------+ +| DIFFUSION ARCH | CONDITIONERS | AUTOENCODER | ++----------------+---------------------------------------------------------------+-------------------------+ +| FLUX.1 | OpenAI CLIP ViT-L/14 (MOSTLY_F16), Google T5-xxl (MOSTLY_F16) | FLUX.1 VAE (MOSTLY_F16) | ++----------------+---------------------------------------------------------------+-------------------------+ +---------------------------------------------------------------------------------------------------------------------------+ | ESTIMATE | @@ -527,7 +540,7 @@ $ gguf-parser --hf-repo="gpustack/FLUX.1-dev-GGUF" --hf-file="FLUX.1-dev-FP16.gg | | | | | +------------+------------+-----------+-----------+ | | | | | | UMA | NONUMA | UMA | NONUMA | +--------+-----------------+-------------+---------------+----------------+------------+------------+-----------+-----------+ -| flux_1 | Disabled | Unsupported | Supported | Yes | 333.45 MiB | 483.45 MiB | 31.89 GiB | 36.28 GiB | +| flux_1 | Disabled | Unsupported | Supported | Yes | 343.89 MiB | 493.89 MiB | 31.89 GiB | 36.28 GiB | +--------+-----------------+-------------+---------------+----------------+------------+------------+-----------+-----------+ $ # Parse FLUX.1-dev Model with multiple devices offloading @@ -541,13 +554,13 @@ $ gguf-parser --hf-repo="gpustack/FLUX.1-dev-GGUF" 
--hf-file="FLUX.1-dev-FP16.gg | model | N/A | diffusion | F16 | true | 31.79 GiB | 17 B | 16.06 bpw | +-------+------+-----------+--------------+---------------+-----------+------------+-----------+ -+-------------------------------------------------------------------------------------+ -| ARCHITECTURE | -+----------------+-------------------------------------------------+------------------+ -| DIFFUSION ARCH | CONDITIONERS | AUTOENCODER | -+----------------+-------------------------------------------------+------------------+ -| FLUX.1 | OpenAI CLIP ViT-L/14 (F16), Google T5-xxl (F16) | FLUX.1 VAE (F16) | -+----------------+-------------------------------------------------+------------------+ ++----------------------------------------------------------------------------------------------------------+ +| ARCHITECTURE | ++----------------+---------------------------------------------------------------+-------------------------+ +| DIFFUSION ARCH | CONDITIONERS | AUTOENCODER | ++----------------+---------------------------------------------------------------+-------------------------+ +| FLUX.1 | OpenAI CLIP ViT-L/14 (MOSTLY_F16), Google T5-xxl (MOSTLY_F16) | FLUX.1 VAE (MOSTLY_F16) | ++----------------+---------------------------------------------------------------+-------------------------+ +-----------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ESTIMATE | @@ -556,31 +569,32 @@ $ gguf-parser --hf-repo="gpustack/FLUX.1-dev-GGUF" --hf-file="FLUX.1-dev-FP16.gg | | | | | +------------+------------+----------+----------+------------+--------+-----------+-----------+ | | | | | | UMA | NONUMA | UMA | NONUMA | UMA | NONUMA | UMA | NONUMA | +--------+-----------------+-------------+---------------+----------------+------------+------------+----------+----------+------------+--------+-----------+-----------+ -| flux_1 | Disabled | Unsupported | Supported | Yes | 333.45 MiB | 483.45 MiB | 9.34 GiB | 9.60 GiB | 259.96 MiB | 7 GiB | 22.29 GiB | 25.05 GiB | +| flux_1 | Disabled | Unsupported | Supported | Yes | 343.89 MiB | 493.89 MiB | 9.34 GiB | 9.60 GiB | 259.96 MiB | 7 GiB | 22.29 GiB | 25.05 GiB | +--------+-----------------+-------------+---------------+----------------+------------+------------+----------+----------+------------+--------+-----------+-----------+ - ``` ##### Parse None Model ```shell $ # Parse Multi-Modal Projector -$ gguf-parser --hf-repo="bartowski/Qwen2-VL-72B-Instruct-GGUF" --hf-file="mmproj-Qwen2-VL-72B-Instruct-f16.gguf" -+---------------------------------------------------------------------------------------------------------------+ -| METADATA | -+-----------+-------------------------+------+--------------+---------------+----------+------------+-----------+ -| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | -+-----------+-------------------------+------+--------------+---------------+----------+------------+-----------+ -| projector | Qwen2-VL-72B-Instruc... 
| clip | F16 | true | 1.30 GiB | 699.36 M | 16.01 bpw | -+-----------+-------------------------+------+--------------+---------------+----------+------------+-----------+ - -+----------------------------------------------------------------------+ -| ARCHITECTURE | -+----------------+---------------+--------+------------------+---------+ -| PROJECTOR TYPE | EMBEDDING LEN | LAYERS | FEED FORWARD LEN | ENCODER | -+----------------+---------------+--------+------------------+---------+ -| qwen2vl_merger | 1280 | 32 | 0 | Vision | -+----------------+---------------+--------+------------------+---------+ +$ gguf-parser --hf-repo="unsloth/Qwen2.5-Omni-3B-GGUF" --hf-file="mmproj-F32.gguf" ++-------------------------------------------------------------------------------------------------------+ +| METADATA | ++-----------+-----------------+------+--------------+---------------+----------+------------+-----------+ +| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | ++-----------+-----------------+------+--------------+---------------+----------+------------+-----------+ +| projector | Qwen2.5-Omni-3B | clip | F32 | true | 4.86 GiB | 1.31 B | 31.93 bpw | ++-----------+-----------------+------+--------------+---------------+----------+------------+-----------+ + ++-------------------------------------------------------------------------------------------------------------------------+ +| ARCHITECTURE | ++----------------+-------------------------------+-----------------+-------------------------------------+----------------+ +| PROJECTOR TYPE | EMBEDDING LEN | LAYERS | FEED FORWARD LEN | ENCODER | +| +---------------+---------------+--------+--------+------------------+------------------+ | +| | VISION | AUDIO | VISION | AUDIO | VISION | AUDIO | | ++----------------+---------------+---------------+--------+--------+------------------+------------------+----------------+ +| qwen2.5o | 1280 | 1280 | 32 | 32 | 1280 | 5120 | Vision & Audio | ++----------------+---------------+---------------+--------+--------+------------------+------------------+----------------+ $ # Parse LoRA Adapter $ gguf-parser --hf-repo="ngxson/test_gguf_lora_adapter" --hf-file="lora-Llama-3-Instruct-abliteration-LoRA-8B-f16.gguf" @@ -599,7 +613,6 @@ $ gguf-parser --hf-repo="ngxson/test_gguf_lora_adapter" --hf-file="lora-Llama-3- +--------------+------------+ | lora | 32 | +--------------+------------+ - ``` ### Estimate @@ -641,7 +654,7 @@ flowchart TD ``` ```shell -$ gguf-parser --hf-repo="hierholzer/Llama-3.1-70B-Instruct-GGUF" --hf-file="Llama-3.1-70B-Instruct-Q4_K_M.gguf" --skip-metadata --skip-architecture --skip-tokenizer --ctx-size=1024 --tensor-split="8,10" --in-short +$ gguf-parser --hf-repo="hierholzer/Llama-3.1-70B-Instruct-GGUF" --hf-file="Llama-3.1-70B-Instruct-Q4_K_M.gguf" --ctx-size=1024 --tensor-split="8,10" --estimate --in-short +------------------------------------------------------------------------------------------------------------------------------+ | ESTIMATE | +----------------------------------------------+--------------------------------------+----------------------------------------+ @@ -649,9 +662,8 @@ $ gguf-parser --hf-repo="hierholzer/Llama-3.1-70B-Instruct-GGUF" --hf-file="Llam +--------------------+------------+------------+----------------+---------+-----------+----------------+-----------+-----------+ | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | 
+--------------------+------------+------------+----------------+---------+-----------+----------------+-----------+-----------+ -| 1 + 0 + 0 | 238.08 MiB | 388.08 MiB | 36 + 0 | 144 MiB | 17.83 GiB | 44 + 1 | 22.01 GiB | 22.57 GiB | +| 1 + 0 + 0 | 249.27 MiB | 399.27 MiB | 36 + 0 | 144 MiB | 17.83 GiB | 44 + 1 | 22.27 GiB | 22.83 GiB | +--------------------+------------+------------+----------------+---------+-----------+----------------+-----------+-----------+ - ``` Based on the output provided, serving the `hierholzer/Llama-3.1-70B-Instruct-GGUF` model on `host1` has the following @@ -659,9 +671,9 @@ resource consumption: | Host | Available RAM | Request RAM | Available VRAM | Request VRAM | Result | |-----------------------|---------------|-------------|----------------|--------------|------------| -| host1 | ENOUGH | 388.08 MiB | | | :thumbsup: | -| host1 (NVIDIA 4080 0) | | | 8 GiB | 17.79 GiB | | -| host1 (NVIDIA 4080 1) | | | 10 GiB | 22.51 GiB | | +| host1 | ENOUGH | 399.27 MiB | | | :thumbsup: | +| host1 (NVIDIA 4080 0) | | | 8 GiB | 17.83 GiB | | +| host1 (NVIDIA 4080 1) | | | 10 GiB | 22.83 GiB | | It appears that running the model on `host1` alone is not feasible. @@ -694,7 +706,7 @@ flowchart TD ``` ```shell -$ gguf-parser --hf-repo="hierholzer/Llama-3.1-70B-Instruct-GGUF" --hf-file="Llama-3.1-70B-Instruct-Q4_K_M.gguf" --skip-metadata --skip-architecture --skip-tokenizer --ctx-size=1024 --tensor-split="8,10,12,6" --rpc="host1:50052,host1:50053,host2:50052,host3:50052" --in-short +$ gguf-parser --hf-repo="hierholzer/Llama-3.1-70B-Instruct-GGUF" --hf-file="Llama-3.1-70B-Instruct-Q4_K_M.gguf" --ctx-size=1024 --tensor-split="8,10,12,6" --rpc="host1:50052,host1:50053,host2:50052,host3:50052" --estimate --in-short +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ESTIMATE | +----------------------------------------------+----------------------------------------------+----------------------------------------------+----------------------------------------------+----------------------------------------------+ @@ -702,9 +714,8 @@ $ gguf-parser --hf-repo="hierholzer/Llama-3.1-70B-Instruct-GGUF" --hf-file="Llam +--------------------+------------+------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+ | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | +--------------------+------------+------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+ -| 1 + 0 + 0 | 238.08 MiB | 388.08 MiB | 18 + 0 | 8.85 GiB | 9.28 GiB | 23 + 0 | 10.88 GiB | 11.32 GiB | 27 + 0 | 12.75 GiB | 13.19 GiB | 12 + 1 | 6.87 GiB | 7.38 GiB | +| 1 + 0 + 0 | 249.27 MiB | 399.27 MiB | 18 + 0 | 8.85 GiB | 9.28 GiB | 23 + 0 | 10.88 GiB | 11.32 GiB | 27 + 0 | 12.75 GiB | 13.19 GiB | 12 + 1 | 7.13 GiB | 7.64 GiB | 
+--------------------+------------+------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+ - ``` According to the output provided, serving the `hierholzer/Llama-3.1-70B-Instruct-GGUF` model on `host4` results in the @@ -712,11 +723,11 @@ following resource consumption: | Host | Available RAM | Request RAM | Available VRAM | Request VRAM | Result | |-----------------------|---------------|-------------|----------------|--------------|------------| -| host4 | 11 GiB | 388.08 MiB | | | :thumbsup: | +| host4 | 11 GiB | 399.27 MiB | | | :thumbsup: | | host1 (NVIDIA 4080 0) | | | 8 GiB | 9.28 GiB | | | host1 (NVIDIA 4080 1) | | | 10 GiB | 11.32 GiB | | | host2 (NVIDIA 4090) | | | 12 GiB | 13.19 GiB | | -| host3 (Apple M1 Max) | ENOUGH | | 6 GiB | 6.87 GiB | | +| host3 (Apple M1 Max) | ENOUGH | | 6 GiB | 7.13 GiB | | It seems that the model cannot be served on `host4`, even with all layers offloaded to `host1`, `host2`, and `host3`. @@ -746,17 +757,16 @@ flowchart TD ``` ```shell -$ gguf-parser --hf-repo="hierholzer/Llama-3.1-70B-Instruct-GGUF" --hf-file="Llama-3.1-70B-Instruct-Q4_K_M.gguf" --skip-metadata --skip-architecture --skip-tokenizer --ctx-size=1024 --tensor-split="11,12,8,10,6" --rpc="host4:50052,host2:50052,host1:50052,host1:50053" --in-short -+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| ESTIMATE | -+----------------------------------------------+----------------------------------------------+----------------------------------------------+----------------------------------------------+----------------------------------------------+---------------------------------------+ -| RAM | RPC 0 (V)RAM | RPC 1 (V)RAM | RPC 2 (V)RAM | RPC 3 (V)RAM | VRAM 0 | -+--------------------+------------+------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+-----------+----------+ -| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | -+--------------------+------------+------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+-----------+----------+ -| 1 + 0 + 0 | 238.08 MiB | 388.08 MiB | 19 + 0 | 9.36 GiB | 9.79 GiB | 21 + 0 | 9.92 GiB | 10.35 GiB | 14 + 0 | 6.57 GiB | 7.01 GiB | 17 + 0 | 8.11 GiB | 8.54 GiB | 9 + 1 | 36.52 MiB | 5.91 GiB | -+--------------------+------------+------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+-----------+----------+ - +$ gguf-parser --hf-repo="hierholzer/Llama-3.1-70B-Instruct-GGUF" --hf-file="Llama-3.1-70B-Instruct-Q4_K_M.gguf" --ctx-size=1024 --tensor-split="11,12,8,10,6" --rpc="host4:50052,host2:50052,host1:50052,host1:50053" --estimate --in-short 
++-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| ESTIMATE | ++----------------------------------------------+----------------------------------------------+----------------------------------------------+----------------------------------------------+----------------------------------------------+----------------------------------------+ +| RAM | RPC 0 (V)RAM | RPC 1 (V)RAM | RPC 2 (V)RAM | RPC 3 (V)RAM | VRAM 0 | ++--------------------+------------+------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+------------+----------+ +| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | ++--------------------+------------+------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+------------+----------+ +| 1 + 0 + 0 | 249.27 MiB | 399.27 MiB | 19 + 0 | 9.36 GiB | 9.79 GiB | 21 + 0 | 9.92 GiB | 10.35 GiB | 14 + 0 | 6.57 GiB | 7.01 GiB | 17 + 0 | 8.11 GiB | 8.54 GiB | 9 + 1 | 302.50 MiB | 6.16 GiB | ++--------------------+------------+------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+------------+----------+ ``` According to the output provided, serving the `hierholzer/Llama-3.1-70B-Instruct-GGUF` model on `host3` results in the @@ -764,12 +774,12 @@ following resource consumption: | Host | Available RAM | Request RAM | Available VRAM | Request VRAM | Result | |-----------------------|---------------|-------------|----------------|--------------|------------| -| host3 (Apple M1 Max) | ENOUGH | 238.08 MiB | | | :thumbsup: | +| host3 (Apple M1 Max) | ENOUGH | 249.27 MiB | | | :thumbsup: | | host4 | 11 GiB | 9.79 GiB | | | :thumbsup: | -| host2 (NVIDIA 4090) | | | 12 GiB | 10.36 GiB | :thumbsup: | +| host2 (NVIDIA 4090) | | | 12 GiB | 10.35 GiB | :thumbsup: | | host1 (NVIDIA 4080 0) | | | 8 GiB | 7.01 GiB | :thumbsup: | | host1 (NVIDIA 4080 1) | | | 10 GiB | 8.54 GiB | :thumbsup: | -| host3 (Apple M1 Max) | | | 6 GiB | 36.52 MiB | :thumbsup: | +| host3 (Apple M1 Max) | | | 6 GiB | 302.50 MiB | :thumbsup: | Now, the model can be successfully served on `host3`, with all layers offloaded to `host1`, `host2`, and `host4`. 
@@ -830,12 +840,12 @@ example and estimate the maximum tokens per second for Apple Silicon M-series us ```shell $ # Estimate full offloaded Q8_0 model -$ gguf-parser --hf-repo TheBloke/LLaMA-7b-GGUF --hf-file llama-7b.Q8_0.gguf --skip-metadata --skip-architecture --skip-tokenizer --in-short \ +$ gguf-parser --hf-repo TheBloke/LLaMA-7b-GGUF --hf-file llama-7b.Q8_0.gguf --estimate --in-short \ -c 512 \ --device-metric ";,;" $ # Estimate full offloaded Q4_0 model -$ gguf-parser --hf-repo TheBloke/LLaMA-7b-GGUF --hf-file llama-7b.Q4_0.gguf --skip-metadata --skip-architecture --skip-tokenizer --in-short \ +$ gguf-parser --hf-repo TheBloke/LLaMA-7b-GGUF --hf-file llama-7b.Q4_0.gguf --estimate --in-short \ -c 512 \ --device-metric ";,;" ``` @@ -886,7 +896,7 @@ $ # --device-metric "224GFLOPS;819.2GBps" <-- Apple Mac Studio 0 CPU FLO $ # --device-metric "27.2TFLOPS;819.2GBps;40Gbps" <-- Apple Mac Studio 1 (RPC 0) iGPU FLOPS, VRAM Bandwidth, and Thunderbolt Bandwidth $ # --device-metric "27.2TFLOPS;819.2GBps;40Gbps" <-- Apple Mac Studio 2 (RPC 1) iGPU FLOPS, VRAM Bandwidth, and Thunderbolt Bandwidth $ # --device-metric "27.2TFLOPS;819.2GBps" <-- Apple Mac Studio 0 iGPU FLOPS and VRAM Bandwidth -$ gguf-parser --hf-repo leafspark/Meta-Llama-3.1-405B-Instruct-GGUF --hf-file Llama-3.1-405B-Instruct.Q4_0.gguf/Llama-3.1-405B-Instruct.Q4_0-00001-of-00012.gguf --skip-metadata --skip-architecture --skip-tokenizer --in-short \ +$ gguf-parser --hf-repo leafspark/Meta-Llama-3.1-405B-Instruct-GGUF --hf-file Llama-3.1-405B-Instruct.Q4_0.gguf/Llama-3.1-405B-Instruct.Q4_0-00001-of-00012.gguf --estimate --in-short \ --no-mmap \ -c 512 \ --rpc host1:port,host2:port \ @@ -923,7 +933,7 @@ $ # --device-metric "510.4GFLOPS;96GBps" <-- Intel i5-14600k CPU FLOPS $ # --device-metric "27.2TFLOPS;819.2GBps;40Gbps" <-- Apple Mac Studio (M2) (RPC 0) iGPU FLOPS, VRAM Bandwidth, and Thunderbolt Bandwidth $ # --device-metric "48.74TFLOPS;736.3GBps;64GBps" <-- NVIDIA GeForce RTX 0 4080 GPU FLOPS, VRAM Bandwidth, and PCIe 5.0 x16 Bandwidth $ # --device-metric "48.74TFLOPS;736.3GBps;8GBps" <-- NVIDIA GeForce RTX 1 4080 GPU FLOPS, VRAM Bandwidth, and PCIe 4.0 x4 Bandwidth -$ gguf-parser --hf-repo Qwen/Qwen2.5-72B-Instruct-GGUF --hf-file qwen2.5-72b-instruct-q4_k_m-00001-of-00012.gguf --skip-metadata --skip-architecture --skip-tokenizer --in-short \ +$ gguf-parser --hf-repo Qwen/Qwen2.5-72B-Instruct-GGUF --hf-file qwen2.5-72b-instruct-q4_k_m-00001-of-00012.gguf --estimate --in-short \ --no-mmap \ -c 8192 \ --rpc host:port \ @@ -946,23 +956,22 @@ $ gguf-parser --hf-repo Qwen/Qwen2.5-72B-Instruct-GGUF --hf-file qwen2.5-72b-ins #### Full Layers Offload (default) ```shell -$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --skip-metadata --skip-architecture --skip-tokenizer --in-short -+--------------------------------------------------------------------------------------+ -| ESTIMATE | -+----------------------------------------------+---------------------------------------+ -| RAM | VRAM 0 | -+--------------------+------------+------------+----------------+---------+------------+ -| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 0 + 0 | 652.53 MiB | 802.53 MiB | 126 + 1 | 126 GiB | 246.59 GiB | -+--------------------+------------+------------+----------------+---------+------------+ - +$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" 
--hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --estimate --in-short ++-------------------------------------------------------------------------------------+ +| ESTIMATE | ++------------------------------------------+------------------------------------------+ +| RAM | VRAM 0 | ++--------------------+----------+----------+----------------+------------+------------+ +| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | ++--------------------+----------+----------+----------------+------------+------------+ +| 1 + 0 + 0 | 1.63 GiB | 1.78 GiB | 126 + 1 | 126.28 GiB | 246.86 GiB | ++--------------------+----------+----------+----------------+------------+------------+ ``` #### Zero Layers Offload ```shell -$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --skip-metadata --skip-architecture --skip-tokenizer --gpu-layers=0 --in-short +$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --gpu-layers=0 --estimate --in-short +------------------------------------------------------------------------------------+ | ESTIMATE | +----------------------------------------------+-------------------------------------+ @@ -970,25 +979,23 @@ $ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-4 +--------------------+------------+------------+----------------+--------+-----------+ | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | +--------------------+------------+------------+----------------+--------+-----------+ -| 1 + 126 + 1 | 126.37 GiB | 126.52 GiB | 0 + 0 | 0 B | 32.34 GiB | +| 1 + 126 + 1 | 127.64 GiB | 127.79 GiB | 0 + 0 | 0 B | 33.62 GiB | +--------------------+------------+------------+----------------+--------+-----------+ - ``` #### Specific Layers Offload ```shell -$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --skip-metadata --skip-architecture --skip-tokenizer --gpu-layers=10 --in-short -+------------------------------------------------------------------------------------+ -| ESTIMATE | -+----------------------------------------------+-------------------------------------+ -| RAM | VRAM 0 | -+--------------------+------------+------------+----------------+--------+-----------+ -| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | -+--------------------+------------+------------+----------------+--------+-----------+ -| 1 + 116 + 1 | 116.64 GiB | 116.78 GiB | 10 + 0 | 10 GiB | 50.39 GiB | -+--------------------+------------+------------+----------------+--------+-----------+ - +$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --gpu-layers=10 --estimate --in-short ++----------------------------------------------------------------------------------+ +| ESTIMATE | ++----------------------------------------------+-----------------------------------+ +| RAM | VRAM 0 | ++--------------------+------------+------------+----------------+--------+---------+ +| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | ++--------------------+------------+------------+----------------+--------+---------+ +| 1 + 126 + 1 | 127.64 GiB | 127.79 GiB | 0 + 0 | 0 B | 250 MiB | ++--------------------+------------+------------+----------------+--------+---------+ ``` #### Specific Context Size @@ -998,7 +1005,7 @@ By default, the context size retrieved from the model's metadata. 
Use `--ctx-size` to specify the context size. ```shell -$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --skip-metadata --skip-architecture --skip-tokenizer --ctx-size=4096 --in-short +$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --ctx-size=4096 --estimate --in-short +--------------------------------------------------------------------------------------+ | ESTIMATE | +----------------------------------------------+---------------------------------------+ @@ -1008,7 +1015,6 @@ $ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-4 +--------------------+------------+------------+----------------+----------+-----------+ | 1 + 0 + 0 | 404.53 MiB | 554.53 MiB | 126 + 1 | 3.94 GiB | 93.28 GiB | +--------------------+------------+------------+----------------+----------+-----------+ - ``` #### Enable Flash Attention @@ -1023,17 +1029,16 @@ Please note that not all models support Flash Attention, if the model does not s Disabled" even if you enable it. ```shell -$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --skip-metadata --skip-architecture --skip-tokenizer --flash-attention --in-short -+--------------------------------------------------------------------------------------+ -| ESTIMATE | -+----------------------------------------------+---------------------------------------+ -| RAM | VRAM 0 | -+--------------------+------------+------------+----------------+---------+------------+ -| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 0 + 0 | 620.53 MiB | 770.53 MiB | 126 + 1 | 126 GiB | 215.70 GiB | -+--------------------+------------+------------+----------------+---------+------------+ - +$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --flash-attention --estimate --in-short ++-------------------------------------------------------------------------------------+ +| ESTIMATE | ++------------------------------------------+------------------------------------------+ +| RAM | VRAM 0 | ++--------------------+----------+----------+----------------+------------+------------+ +| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | ++--------------------+----------+----------+----------------+------------+------------+ +| 1 + 0 + 0 | 1.63 GiB | 1.78 GiB | 126 + 1 | 126.28 GiB | 215.98 GiB | ++--------------------+----------+----------+----------------+------------+------------+ ``` #### Disable MMap @@ -1049,7 +1054,7 @@ Please note that some models require loading the whole weight into memory, if th LOAD" shows "Not Supported". 
```shell -$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --skip-metadata --skip-architecture --skip-tokenizer --no-mmap --in-short +$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --no-mmap --estimate --in-short +-------------------------------------------------------------------------------------+ | ESTIMATE | +------------------------------------------+------------------------------------------+ @@ -1057,9 +1062,8 @@ $ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-4 +--------------------+----------+----------+----------------+------------+------------+ | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | +--------------------+----------+----------+----------------+------------+------------+ -| 1 + 0 + 0 | 1.98 GiB | 2.13 GiB | 126 + 1 | 213.97 GiB | 246.59 GiB | +| 1 + 0 + 0 | 2.97 GiB | 3.12 GiB | 126 + 1 | 214.24 GiB | 246.86 GiB | +--------------------+----------+----------+----------------+------------+------------+ - ``` #### With Adapter @@ -1067,19 +1071,19 @@ $ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-4 Use `--lora`/`--control-vector` to estimate the usage when loading a model with adapters. ```shell -$ gguf-parser --hf-repo="QuantFactory/Meta-Llama-3-8B-Instruct-GGUF" --hf-file="Meta-Llama-3-8B-Instruct.Q5_K_M.gguf" --skip-metadata --skip-architecture --skip-tokenizer --in-short -+-----------------------------------------------------------------------------------+ -| ESTIMATE | -+----------------------------------------------+------------------------------------+ -| RAM | VRAM 0 | -+--------------------+------------+------------+----------------+--------+----------+ -| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | -+--------------------+------------+------------+----------------+--------+----------+ -| 1 + 0 + 0 | 163.62 MiB | 313.62 MiB | 32 + 1 | 1 GiB | 6.79 GiB | -+--------------------+------------+------------+----------------+--------+----------+ +$ gguf-parser --hf-repo="QuantFactory/Meta-Llama-3-8B-Instruct-GGUF" --hf-file="Meta-Llama-3-8B-Instruct.Q5_K_M.gguf" --estimate --in-short ++-------------------------------------------------------------------------------------+ +| ESTIMATE | ++----------------------------------------------+--------------------------------------+ +| RAM | VRAM 0 | ++--------------------+------------+------------+----------------+----------+----------+ +| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | ++--------------------+------------+------------+----------------+----------+----------+ +| 1 + 0 + 0 | 210.80 MiB | 360.80 MiB | 32 + 1 | 1.25 GiB | 7.04 GiB | ++--------------------+------------+------------+----------------+----------+----------+ $ # With a LoRA adapter. 
-$ gguf-parser --hf-repo="QuantFactory/Meta-Llama-3-8B-Instruct-GGUF" --hf-file="Meta-Llama-3-8B-Instruct.Q5_K_M.gguf" --lora-url="https://huggingface.co/ngxson/test_gguf_lora_adapter/resolve/main/lora-Llama-3-Instruct-abliteration-LoRA-8B-f16.gguf" --skip-metadata --skip-architecture --skip-tokenizer --in-short +$ gguf-parser --hf-repo="QuantFactory/Meta-Llama-3-8B-Instruct-GGUF" --hf-file="Meta-Llama-3-8B-Instruct.Q5_K_M.gguf" --lora-url="https://huggingface.co/ngxson/test_gguf_lora_adapter/resolve/main/lora-Llama-3-Instruct-abliteration-LoRA-8B-f16.gguf" --estimate --in-short +-------------------------------------------------------------------------------------+ | ESTIMATE | +----------------------------------------------+--------------------------------------+ @@ -1087,9 +1091,8 @@ $ gguf-parser --hf-repo="QuantFactory/Meta-Llama-3-8B-Instruct-GGUF" --hf-file=" +--------------------+------------+------------+----------------+----------+----------+ | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | +--------------------+------------+------------+----------------+----------+----------+ -| 1 + 0 + 0 | 168.64 MiB | 318.64 MiB | 32 + 1 | 1.16 GiB | 6.94 GiB | +| 1 + 0 + 0 | 223.91 MiB | 373.91 MiB | 32 + 1 | 1.42 GiB | 7.20 GiB | +--------------------+------------+------------+----------------+----------+----------+ - ``` #### Get Proper Offload Layers @@ -1097,61 +1100,60 @@ $ gguf-parser --hf-repo="QuantFactory/Meta-Llama-3-8B-Instruct-GGUF" --hf-file=" Use `--gpu-layers-step` to get the proper offload layers number when the model is too large to fit into the GPUs memory. ```shell -$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --skip-metadata --skip-architecture --skip-tokenizer --gpu-layers-step=6 --in-short -+--------------------------------------------------------------------------------------+ -| ESTIMATE | -+----------------------------------------------+---------------------------------------+ -| RAM | VRAM 0 | -+--------------------+------------+------------+----------------+---------+------------+ -| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 126 + 1 | 126.59 GiB | 126.73 GiB | 0 + 0 | 0 B | 250 MiB | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 120 + 1 | 120.64 GiB | 120.78 GiB | 6 + 0 | 6 GiB | 43.68 GiB | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 114 + 1 | 114.64 GiB | 114.78 GiB | 12 + 0 | 12 GiB | 53.74 GiB | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 108 + 1 | 108.64 GiB | 108.78 GiB | 18 + 0 | 18 GiB | 63.80 GiB | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 102 + 1 | 102.64 GiB | 102.78 GiB | 24 + 0 | 24 GiB | 73.86 GiB | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 96 + 1 | 96.64 GiB | 96.78 GiB | 30 + 0 | 30 GiB | 83.93 GiB | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 90 + 1 | 90.64 GiB | 90.78 GiB | 36 + 0 | 36 GiB | 93.99 GiB | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 84 + 1 | 84.64 GiB | 84.78 GiB | 42 + 0 | 42 GiB | 104.05 GiB | 
-+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 78 + 1 | 78.64 GiB | 78.78 GiB | 48 + 0 | 48 GiB | 114.11 GiB | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 72 + 1 | 72.64 GiB | 72.78 GiB | 54 + 0 | 54 GiB | 124.17 GiB | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 66 + 1 | 66.64 GiB | 66.78 GiB | 60 + 0 | 60 GiB | 134.23 GiB | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 60 + 1 | 60.64 GiB | 60.78 GiB | 66 + 0 | 66 GiB | 144.29 GiB | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 54 + 1 | 54.64 GiB | 54.78 GiB | 72 + 0 | 72 GiB | 154.35 GiB | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 48 + 1 | 48.64 GiB | 48.78 GiB | 78 + 0 | 78 GiB | 164.42 GiB | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 42 + 1 | 42.64 GiB | 42.78 GiB | 84 + 0 | 84 GiB | 174.48 GiB | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 36 + 1 | 36.64 GiB | 36.78 GiB | 90 + 0 | 90 GiB | 184.54 GiB | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 30 + 1 | 30.64 GiB | 30.78 GiB | 96 + 0 | 96 GiB | 194.60 GiB | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 24 + 1 | 24.64 GiB | 24.78 GiB | 102 + 0 | 102 GiB | 204.66 GiB | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 18 + 1 | 18.64 GiB | 18.78 GiB | 108 + 0 | 108 GiB | 214.72 GiB | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 12 + 1 | 12.64 GiB | 12.78 GiB | 114 + 0 | 114 GiB | 225.05 GiB | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 6 + 1 | 6.64 GiB | 6.78 GiB | 120 + 0 | 120 GiB | 235.64 GiB | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 0 + 1 | 653.08 MiB | 803.08 MiB | 126 + 0 | 126 GiB | 246.24 GiB | -+--------------------+------------+------------+----------------+---------+------------+ -| 1 + 0 + 0 | 652.53 MiB | 802.53 MiB | 126 + 1 | 126 GiB | 246.59 GiB | -+--------------------+------------+------------+----------------+---------+------------+ - +$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --gpu-layers-step=6 --estimate --in-short ++-----------------------------------------------------------------------------------------+ +| ESTIMATE | ++----------------------------------------------+------------------------------------------+ +| RAM | VRAM 0 | ++--------------------+------------+------------+----------------+------------+------------+ +| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | ++--------------------+------------+------------+----------------+------------+------------+ +| 1 + 126 + 1 | 127.64 GiB | 127.79 GiB | 0 + 0 | 0 B | 250 MiB | ++--------------------+------------+------------+----------------+------------+------------+ +| 1 + 120 + 1 | 121.90 GiB | 122.05 GiB | 6 + 0 | 6 GiB | 44.68 GiB | ++--------------------+------------+------------+----------------+------------+------------+ +| 1 + 114 + 1 | 115.90 GiB | 116.05 GiB | 12 + 0 | 12 GiB | 54.74 GiB | 
++--------------------+------------+------------+----------------+------------+------------+ +| 1 + 108 + 1 | 109.90 GiB | 110.05 GiB | 18 + 0 | 18 GiB | 64.80 GiB | ++--------------------+------------+------------+----------------+------------+------------+ +| 1 + 102 + 1 | 103.90 GiB | 104.05 GiB | 24 + 0 | 24 GiB | 74.86 GiB | ++--------------------+------------+------------+----------------+------------+------------+ +| 1 + 96 + 1 | 97.90 GiB | 98.05 GiB | 30 + 0 | 30 GiB | 84.93 GiB | ++--------------------+------------+------------+----------------+------------+------------+ +| 1 + 90 + 1 | 91.90 GiB | 92.05 GiB | 36 + 0 | 36 GiB | 94.99 GiB | ++--------------------+------------+------------+----------------+------------+------------+ +| 1 + 84 + 1 | 85.90 GiB | 86.05 GiB | 42 + 0 | 42 GiB | 105.05 GiB | ++--------------------+------------+------------+----------------+------------+------------+ +| 1 + 78 + 1 | 79.90 GiB | 80.05 GiB | 48 + 0 | 48 GiB | 115.11 GiB | ++--------------------+------------+------------+----------------+------------+------------+ +| 1 + 72 + 1 | 73.90 GiB | 74.05 GiB | 54 + 0 | 54 GiB | 125.17 GiB | ++--------------------+------------+------------+----------------+------------+------------+ +| 1 + 66 + 1 | 67.90 GiB | 68.05 GiB | 60 + 0 | 60 GiB | 135.23 GiB | ++--------------------+------------+------------+----------------+------------+------------+ +| 1 + 60 + 1 | 61.90 GiB | 62.05 GiB | 66 + 0 | 66 GiB | 145.29 GiB | ++--------------------+------------+------------+----------------+------------+------------+ +| 1 + 54 + 1 | 55.90 GiB | 56.05 GiB | 72 + 0 | 72 GiB | 155.35 GiB | ++--------------------+------------+------------+----------------+------------+------------+ +| 1 + 48 + 1 | 49.90 GiB | 50.05 GiB | 78 + 0 | 78 GiB | 165.42 GiB | ++--------------------+------------+------------+----------------+------------+------------+ +| 1 + 42 + 1 | 43.90 GiB | 44.05 GiB | 84 + 0 | 84 GiB | 175.48 GiB | ++--------------------+------------+------------+----------------+------------+------------+ +| 1 + 36 + 1 | 37.90 GiB | 38.05 GiB | 90 + 0 | 90 GiB | 185.54 GiB | ++--------------------+------------+------------+----------------+------------+------------+ +| 1 + 30 + 1 | 31.90 GiB | 32.05 GiB | 96 + 0 | 96 GiB | 195.60 GiB | ++--------------------+------------+------------+----------------+------------+------------+ +| 1 + 24 + 1 | 25.90 GiB | 26.05 GiB | 102 + 0 | 102 GiB | 205.66 GiB | ++--------------------+------------+------------+----------------+------------+------------+ +| 1 + 18 + 1 | 19.90 GiB | 20.05 GiB | 108 + 0 | 108 GiB | 215.72 GiB | ++--------------------+------------+------------+----------------+------------+------------+ +| 1 + 12 + 1 | 13.90 GiB | 14.05 GiB | 114 + 0 | 114 GiB | 226.05 GiB | ++--------------------+------------+------------+----------------+------------+------------+ +| 1 + 6 + 1 | 7.90 GiB | 8.05 GiB | 120 + 0 | 120 GiB | 236.64 GiB | ++--------------------+------------+------------+----------------+------------+------------+ +| 1 + 0 + 1 | 1.90 GiB | 2.05 GiB | 126 + 0 | 126 GiB | 246.24 GiB | ++--------------------+------------+------------+----------------+------------+------------+ +| 1 + 0 + 0 | 1.63 GiB | 1.78 GiB | 126 + 1 | 126.28 GiB | 246.86 GiB | ++--------------------+------------+------------+----------------+------------+------------+ ``` ## License diff --git a/vendor/github.com/gpustack/gguf-parser-go/cache.go b/vendor/github.com/gpustack/gguf-parser-go/cache.go index 33fd753c..cb8311ad 100644 --- 
a/vendor/github.com/gpustack/gguf-parser-go/cache.go +++ b/vendor/github.com/gpustack/gguf-parser-go/cache.go @@ -55,7 +55,8 @@ func (c GGUFFileCache) Get(key string, exp time.Duration) (*GGUFFile, error) { return nil, fmt.Errorf("GGUF file cache get: %w", err) } } - if len(gf.Header.MetadataKV) == 0 || len(gf.TensorInfos) == 0 { + + if len(gf.TensorInfos) == 0 { _ = os.Remove(p) return nil, ErrGGUFFileCacheCorrupted } diff --git a/vendor/github.com/gpustack/gguf-parser-go/file.go b/vendor/github.com/gpustack/gguf-parser-go/file.go index 0c6a2e3a..ae6a2b08 100644 --- a/vendor/github.com/gpustack/gguf-parser-go/file.go +++ b/vendor/github.com/gpustack/gguf-parser-go/file.go @@ -419,6 +419,10 @@ func parseGGUFFile(fs []_GGUFFileReadSeeker, o _GGUFReadOptions) (_ *GGUFFile, e // Types for GGUF hierarchical tensors. type ( + // GGUFTensorInfoFilter is a filter to filter out if the given tensor name matches. + // Return true if the name matches, and false otherwise. + GGUFTensorInfoFilter func(name string) bool + // IGGUFTensorInfos is an interface for GGUF tensor infos, // which includes basic operations. IGGUFTensorInfos interface { @@ -435,9 +439,9 @@ type ( // and the number of names found. Index(names []string) (infos map[string]GGUFTensorInfo, found int) // Elements returns the number of elements(parameters). - Elements() uint64 + Elements(filter ...GGUFTensorInfoFilter) uint64 // Bytes returns the number of bytes. - Bytes() uint64 + Bytes(filter ...GGUFTensorInfoFilter) uint64 // Count returns the number of tensors. Count() uint64 } @@ -877,11 +881,17 @@ func (ti GGUFTensorInfo) Index(names []string) (infos map[string]GGUFTensorInfo, // Elements returns the number of elements of the GGUFTensorInfo, // which is inspired by // https://github.com/ggerganov/ggml/blob/a10a8b880c059b3b29356eb9a9f8df72f03cdb6a/src/ggml.c#L2597-L2601. -func (ti GGUFTensorInfo) Elements() uint64 { +func (ti GGUFTensorInfo) Elements(filter ...GGUFTensorInfoFilter) uint64 { if ti.NDimensions == 0 { return 0 } + for i := range filter { + if filter[i] != nil && !filter[i](ti.Name) { + return 0 + } + } + ret := uint64(1) for i := uint32(0); i < ti.NDimensions; i++ { ret *= ti.Dimensions[i] @@ -892,7 +902,7 @@ func (ti GGUFTensorInfo) Elements() uint64 { // Bytes returns the number of bytes of the GGUFTensorInfo, // which is inspired by // https://github.com/ggerganov/ggml/blob/a10a8b880c059b3b29356eb9a9f8df72f03cdb6a/src/ggml.c#L2609-L2626. -func (ti GGUFTensorInfo) Bytes() uint64 { +func (ti GGUFTensorInfo) Bytes(filter ...GGUFTensorInfoFilter) uint64 { if ti.NDimensions == 0 { return 0 } @@ -902,6 +912,12 @@ func (ti GGUFTensorInfo) Bytes() uint64 { panic(fmt.Errorf("invalid type: %v", ti.Type)) } + for i := range filter { + if filter[i] != nil && !filter[i](ti.Name) { + return 0 + } + } + // https://github.com/ggerganov/ggml/blob/a10a8b880c059b3b29356eb9a9f8df72f03cdb6a/src/ggml.c#L3210-L3214 nb := make([]uint64, 0, ti.NDimensions) { @@ -1061,7 +1077,7 @@ func (tis GGUFTensorInfos) layers() GGUFLayerTensorInfos { } l := pm[p].(*GGUFNamedTensorInfos) l.GGUFLayerTensorInfos = append(l.GGUFLayerTensorInfos, tis[i]) - case (ps[0] == "v" || ps[0] == "t") && ps[1] == "blk": + case (ps[0] == "v" || ps[0] == "t" || ps[0] == "a") && ps[1] == "blk": // LLaMACpp CLIP. p := ps[0] if _, ok := pm[p]; !ok { @@ -1282,19 +1298,19 @@ func (ltis GGUFLayerTensorInfos) Index(names []string) (infos map[string]GGUFTen } // Elements returns the number of elements of the GGUFLayerTensorInfos. 
-func (ltis GGUFLayerTensorInfos) Elements() uint64 { +func (ltis GGUFLayerTensorInfos) Elements(filter ...GGUFTensorInfoFilter) uint64 { var ret uint64 for i := range ltis { - ret += ltis[i].Elements() + ret += ltis[i].Elements(filter...) } return ret } // Bytes returns the number of bytes of the GGUFLayerTensorInfos. -func (ltis GGUFLayerTensorInfos) Bytes() uint64 { +func (ltis GGUFLayerTensorInfos) Bytes(filter ...GGUFTensorInfoFilter) uint64 { var ret uint64 for i := range ltis { - ret += ltis[i].Bytes() + ret += ltis[i].Bytes(filter...) } return ret } @@ -1697,7 +1713,7 @@ func (rd _GGUFTensorInfoReader) Read() (ti GGUFTensorInfo, err error) { } ti.Type = GGMLType(v) if ti.Type >= _GGMLTypeCount { - return ti, fmt.Errorf("invalid type: %v", ti.Type) + return ti, fmt.Errorf("%v: This quantized type is currently unsupported", ti.Type) } } diff --git a/vendor/github.com/gpustack/gguf-parser-go/file_architecture.go b/vendor/github.com/gpustack/gguf-parser-go/file_architecture.go index f545d0ee..ad2f1ca9 100644 --- a/vendor/github.com/gpustack/gguf-parser-go/file_architecture.go +++ b/vendor/github.com/gpustack/gguf-parser-go/file_architecture.go @@ -2,6 +2,7 @@ package gguf_parser import ( "regexp" + "slices" "strings" ) @@ -41,6 +42,8 @@ type ( ExpertCount uint32 `json:"expertCount,omitempty"` // ExpertUsedCount(n_expert_used) is the number of experts used during each token evaluation in MoE models. ExpertUsedCount uint32 `json:"expertUsedCount,omitempty"` + // ExpertSharedCount(n_expert_shared) is the number of shared experts in MoE models. + ExpertSharedCount uint32 `json:"expertSharedCount,omitempty"` // AttentionHeadCount(n_head) is the number of attention heads. AttentionHeadCount uint64 `json:"attentionHeadCount,omitempty"` // AttentionHeadCountKV(n_head_kv) is the number of attention heads per group used in Grouped-Query-Attention. @@ -48,6 +51,14 @@ type ( // If not provided or equal to AttentionHeadCount, // the model does not use Grouped-Query-Attention. AttentionHeadCountKV uint64 `json:"attentionHeadCountKV,omitempty"` + // AttentionSlidingWindowPattern is the pattern used in the sliding window attention. + // + // 0 means all layers are Sliding Window Attention. + // 1 means all layers are none Sliding Window Attention. + // N means every Nth layer is none Sliding Window Attention. + AttentionSlidingWindowPattern uint32 `json:"attentionSlidingWindowPattern,omitempty"` + // AttentionSlidingWindow is the size of the sliding window used in the attention layer. + AttentionSlidingWindow uint64 `json:"attentionSlidingWindow,omitempty"` // AttentionMaxALiBIBias is the maximum bias to use for ALiBI. AttentionMaxALiBIBias float32 `json:"attentionMaxALiBIBias,omitempty"` // AttentionClampKQV describes a value `C`, @@ -58,20 +69,46 @@ type ( // AttentionLayerNormRMSEpsilon is the epsilon value used in the RMSNorm(root Mean Square Layer Normalization), // which is a simplification of the original LayerNorm. AttentionLayerNormRMSEpsilon float32 `json:"attentionLayerNormRMSEpsilon,omitempty"` + // AttentionQueryLORARank is the LORA rank of the query matrix. + // + // Zero means no LORA. + AttentionQueryLORARank uint32 `json:"attentionQueryLORARank,omitempty"` + // AttentionKeyValueLORARank is the LORA rank of the key/value matrix. + // + // Zero means no LORA. + AttentionKeyValueLORARank uint32 `json:"attentionKeyValueLORARank,omitempty"` // AttentionKeyLength(n_embd_head_k) is the size of a key head. // // Defaults to `EmbeddingLength / AttentionHeadCount`. 
AttentionKeyLength uint32 `json:"attentionKeyLength,omitempty"` + // AttentionKeyLengthMLA(n_embd_head_k_mla) is the size of a key head in MLA(Multi-Layer Attention). + // + // Zero means no MLA. + AttentionKeyLengthMLA uint32 `json:"attentionKeyLengthMLA,omitempty"` // AttentionValueLength(n_embd_head_v) is the size of a value head. // // Defaults to `EmbeddingLength / AttentionHeadCount`. AttentionValueLength uint32 `json:"attentionValueLength,omitempty"` + // AttentionValueLengthMLA(n_embd_head_v_mla) is the size of a value head in MLA(Multi-Layer Attention). + // + // Zero means no MLA. + AttentionValueLengthMLA uint32 `json:"attentionValueLengthMLA,omitempty"` // AttentionCausal is true if the attention is causal. AttentionCausal bool `json:"attentionCausal,omitempty"` + // AttentionRecurrent is true if the attention is recurrent. + // + // Used in Mamba, RWKV, and similar architectures. + AttentionRecurrent bool `json:"attentionRecurrent,omitempty"` + // AttentionHybrid is true if the attention is hybrid (causal (self-attention) + recurrent). + // + // Used in Jamba, Falcon-H1, and similar architectures. + AttentionHybrid bool `json:"attentionHybrid,omitempty"` // RoPEDimensionCount is the number of dimensions in the RoPE(Rotary Positional Encoding). RoPEDimensionCount uint64 `json:"ropeDimensionCount,omitempty"` // RoPEFrequencyBase is the base frequency of the RoPE. RoPEFrequencyBase float32 `json:"ropeFrequencyBase,omitempty"` + // RoPEFrequencyScale is the scale frequency of the RoPE. + RoPEFrequencyScale float32 `json:"ropeFrequencyScale,omitempty"` // RoPEFrequencyScale is the frequency scale of the RoPE. RoPEScalingType string `json:"ropeScalingType,omitempty"` // RoPEScalingFactor is the scaling factor of the RoPE. @@ -80,14 +117,26 @@ type ( RoPEScalingOriginalContextLength uint64 `json:"ropeScalingOriginalContextLength,omitempty"` // RoPEScalingFinetuned is true if the RoPE scaling is fine-tuned. RoPEScalingFinetuned bool `json:"ropeScalingFinetuned,omitempty"` - // SSMConvolutionKernel is the size of the convolution kernel used in the SSM(Selective State Space Model). + // SSMConvolutionKernel is the size of the convolution kernel used in the Selective State Space Model (SSM) and similar architectures. SSMConvolutionKernel uint32 `json:"ssmConvolutionKernel,omitempty"` - // SSMInnerSize is the embedding size of the state in SSM. + // SSMInnerSize is the embedding size of the state in SSM and similar architectures. SSMInnerSize uint32 `json:"ssmInnerSize,omitempty"` - // SSMStateSize is the size of the recurrent state in SSM. + // SSMStateSize is the size of the recurrent state in SSM and similar architectures. SSMStateSize uint32 `json:"ssmStateSize,omitempty"` - // SSMTimeStepRank is the rank of the time steps in SSM. + // SSMTimeStepRank is the rank of the time steps in SSM and similar architectures. SSMTimeStepRank uint32 `json:"ssmTimeStepRank,omitempty"` + // SSMGroupCount is the number of groups in the SSM and similar architectures. + SSMGroupCount uint32 `json:"ssmGroupCount,omitempty"` + // WKVHeadSize is the size of the head in RWKV and similar architectures. + RWKVHeadSize uint32 `json:"rwkvHeadSize,omitempty"` + // RWKVRescaleEveryNLayers is the number of layers after which the rescaling is applied in RWKV and similar architectures. + RWKVRescaleEveryNLayers uint32 `json:"rwkvRescaleEveryNLayers,omitempty"` + // RWKVTimeMixExtraDimension indicates whether the RWKV architecture has an extra dimension for time mixing. 
+ RWKVTimeMixExtraDimension uint32 `json:"rwkvTimeMixExtraDimension,omitempty"` + // RWKVTimeDecayExtraDimension indicates whether the RWKV architecture has an extra dimension for time decay. + RWKVTimeDecayExtraDimension uint32 `json:"rwkvTimeDecayExtraDimension,omitempty"` + // TokenShiftCount is the number of token shifts used in RWKV and similar architectures. + RWKVTokenShiftCount uint32 `json:"rwkvTokenShiftCount,omitempty"` // VocabularyLength is the size of the vocabulary. // // VocabularyLength is the same as the tokenizer's token size. @@ -95,13 +144,6 @@ type ( /* Appendix */ - // EmbeddingGGQA is the GQA of the embedding layer. - EmbeddingGQA uint64 `json:"embeddingGQA,omitempty"` - // EmbeddingKeyGQA is the number of key GQA in the embedding layer. - EmbeddingKeyGQA uint64 `json:"embeddingKeyGQA,omitempty"` - // EmbeddingValueGQA is the number of value GQA in the embedding layer. - EmbeddingValueGQA uint64 `json:"embeddingValueGQA,omitempty"` - // ClipProjectorType is the type of the projector used in the clip model. // // Only used when Architecture is "clip". @@ -109,47 +151,123 @@ type ( // ClipHasLLaVAProjector indicates whether the clip model has LLaVA projector or not. // // Only used when Architecture is "clip". + // + // Deprecated: use ClipProjectorType instead. ClipHasLLaVAProjector bool `json:"clipHasLLaVAProjector,omitempty"` // ClipHasMiniCPMVProjector indicates whether the clip model has MiniCPMV projector or not. // // Only used when Architecture is "clip". + // + // Deprecated: use ClipProjectorType instead. ClipHasMiniCPMVProjector bool `json:"clipHasMiniCPMVProject,omitempty"` // ClipMiniCPMVVersion is the version of the MiniCPMV projector. // - // Only used when Architecture is "clip" and ClipHasMiniCPMVProjector is true. + // Only used when Architecture is "clip". ClipMiniCPMVVersion int32 `json:"clipMiniCPMVVersion,omitempty"` + // ClipMiniCPMVQueryNum is the number of queries used in the MiniCPMV projector. + // + // Only used when Architecture is "clip". + ClipMiniCPMVQueryNum int32 `json:"clipMiniCPMVQueryNum,omitempty"` // ClipHasGLMProjector indicates whether the clip model has GLM projector or not. // // Only used when Architecture is "clip". + // + // Deprecated: use ClipProjectorType instead. ClipHasGLMProjector bool `json:"clipHasGLMProjector,omitempty"` // ClipHasQwen2VLMerger indicates whether the clip model has Qwen2VL merger or not. // // Only used when Architecture is "clip". - ClipHasQwen2VLMerger bool `json:"clipHasQwen2VLMerger,omitempty"` - // ClipHasTextEncoder indicates whether the clip model has text encoder or not. // - // Only used when Architecture is "clip". - ClipHasTextEncoder bool `json:"clipHasTextEncoder,omitempty"` + // Deprecated: use ClipProjectorType instead. + ClipHasQwen2VLMerger bool `json:"clipHasQwen2VLMerger,omitempty"` // ClipHasVisionEncoder indicates whether the clip model has vision encoder or not. // // Only used when Architecture is "clip". ClipHasVisionEncoder bool `json:"clipHasVisionEncoder,omitempty"` - // ClipVisionImageSize indicates the image size of vision encoder. + // ClipVisionEmbeddingLength indicates the embedding length of vision encoder. // // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. - ClipVisionImageSize uint32 `json:"clipVisionImageSize,omitempty"` - // ClipVisionPatchSize indicates the patch size of vision encoder. 
+ ClipVisionEmbeddingLength uint64 `json:"clipVisionEmbeddingLength,omitempty"` + // ClipVisionBlockCount indicates the number of blocks in the vision encoder. // // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. - ClipVisionPatchSize uint32 `json:"clipVisionPatchSize,omitempty"` + ClipVisionBlockCount uint64 `json:"clipVisionBlockCount,omitempty"` + // ClipVisionFeedForwardLength indicates the feed-forward length of the vision encoder. + // + // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. + ClipVisionFeedForwardLength []uint64 `json:"clipVisionFeedForwardLength,omitempty"` + // ClipVisionAttentionHeadCount indicates the number of attention heads in the vision encoder. + // + // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. + ClipVisionAttentionHeadCount uint64 `json:"clipVisionAttentionHeadCount,omitempty"` + // ClipVisionAttentionLayerNormRMSEpsilon indicates the epsilon value used in the RMSNorm of the vision encoder. + // + // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. + ClipVisionAttentionLayerNormRMSEpsilon float32 `json:"clipVisionAttentionLayerNormRMSEpsilon,omitempty"` // ClipVisionProjectionDim indicates the projection dimension of vision encoder. // // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. ClipVisionProjectionDim uint32 `json:"clipVisionProjectionDim,omitempty"` + // ClipVisionProjectorScaleFactor is the scale factor of the projector. + // + // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. + ClipVisionProjectorScaleFactor uint32 `json:"clipVisionProjectorScaleFactor,omitempty"` + // ClipVisionImageSize indicates the image size of vision encoder. + // + // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. + ClipVisionImageSize uint32 `json:"clipVisionImageSize,omitempty"` + // ClipVisionPatchSize indicates the patch size of vision encoder. + // + // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. + ClipVisionPatchSize uint32 `json:"clipVisionPatchSize,omitempty"` // ClipVisionMMPatchMergeType indicates the merge type of the vision encoder. // // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. ClipVisionMMPatchMergeType string `json:"clipVisionMMPatchMergeType,omitempty"` + // ClipVisionSpatialMergeSize is the spatial merge size of the vision encoder. + // + // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. + ClipVisionSpatialMergeSize uint32 `json:"clipVisionSpatialMergeSize,omitempty"` + // ClipVisionWindowAttentionPattern is the Window Attention pattern used in the vision encoder. + // + // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. + ClipVisionWindowAttentionPattern uint32 `json:"clipVisionWindowAttentionPattern,omitempty"` + // ClipHasAudioEncoder indicates whether the clip model has audio encoder or not. + // + // Only used when Architecture is "clip". + ClipHasAudioEncoder bool `json:"clipHasAudioEncoder,omitempty"` + // ClipAudioEmbeddingLength indicates the embedding length of audio encoder. + // + // Only used when Architecture is "clip" and ClipHasAudioEncoder is true. + ClipAudioEmbeddingLength uint64 `json:"clipAudioEmbeddingLength,omitempty"` + // ClipAudioBlockCount indicates the number of blocks in the audio encoder. + // + // Only used when Architecture is "clip" and ClipHasAudioEncoder is true. 
+ ClipAudioBlockCount uint64 `json:"clipAudioBlockCount,omitempty"` + // ClipAudioFeedForwardLength indicates the feed-forward length of the audio encoder. + // + // Only used when Architecture is "clip" and ClipHasAudioEncoder is true. + ClipAudioFeedForwardLength []uint64 `json:"clipAudioFeedForwardLength,omitempty"` + // ClipAudioAttentionHeadCount indicates the number of attention heads in the audio encoder. + // + // Only used when Architecture is "clip" and ClipHasAudioEncoder is true. + ClipAudioAttentionHeadCount uint64 `json:"clipAudioAttentionHeadCount,omitempty"` + // ClipAudioAttentionLayerNormRMSEpsilon indicates the epsilon value used in the RMSNorm of the audio encoder. + // + // Only used when Architecture is "clip" and ClipHasAudioEncoder is true. + ClipAudioAttentionLayerNormRMSEpsilon float32 `json:"clipAudioAttentionLayerNormRMSEpsilon,omitempty"` + // ClipAudioProjectionDim indicates the projection dimension of audio encoder. + // + // Only used when Architecture is "clip" and ClipHasAudioEncoder is true. + ClipAudioProjectionDim uint32 `json:"clipAudioProjectionDim,omitempty"` + // ClipAudioProjectorStackFactor is the scale factor of the projector. + // + // Only used when Architecture is "clip" and ClipHasAudioEncoder is true. + ClipAudioProjectorStackFactor uint32 `json:"clipAudioProjectorStackFactor,omitempty"` + // ClipAudioNumMelBins is the number of mel bins used in the audio encoder. + // + // Only used when Architecture is "clip" and ClipHasAudioEncoder is true. + ClipAudioNumMelBins uint32 `json:"clipAudioNumMelBins,omitempty"` // AdapterType is the type of the adapter. // @@ -236,11 +354,11 @@ func (gaa GGUFArchitectureDiffusionAutoencoder) String() string { // Architecture returns the architecture metadata of the GGUF file. 
func (gf *GGUFFile) Architecture() (ga GGUFArchitecture) { - if gf.TensorInfos.Match(regexp.MustCompile(`^model\.diffusion_model\..*`)) || - gf.TensorInfos.Match(regexp.MustCompile(`^double_blocks\..*`)) { - return gf.diffuserArchitecture() + for _, re := range _GGUFPotentialDiffusionArchitectureTensorsRegexes { + if gf.TensorInfos.Match(re) { + return gf.diffuserArchitecture() + } } - var ( generalTypeKey = "general.type" generalArchitectureKey = "general.architecture" @@ -274,6 +392,8 @@ func (gf *GGUFFile) Architecture() (ga GGUFArchitecture) { return gf.adapterArchitecture(arch) case typ == "adapter": return gf.adapterArchitecture(arch) + case typ == "imatrix": + return gf.imatrixArchitecture(arch) } return gf.transformerArchitecture(arch) } @@ -282,11 +402,16 @@ func (gf *GGUFFile) diffuserArchitecture() (ga GGUFArchitecture) { const ( // Diffusion - sdKey = "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.attn2.to_v.weight" // SD 1.x/2.x - sdXlKey = "model.diffusion_model.output_blocks.5.1.transformer_blocks.1.attn1.to_v.weight" // SD XL - sdXlRefinerKey = "model.diffusion_model.output_blocks.8.1.transformer_blocks.1.attn1.to_v.weight" // SD XL Refiner - sd3Key = "model.diffusion_model.joint_blocks.23.x_block.attn.proj.weight" // SD 3.x - sdInPaintFeatureKey = "model.diffusion_model.input_blocks.0.0.weight" // SD in-paint feature + sdKey = "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.attn2.to_v.weight" // SD 1.x/2.x + sdKey2 = "output_blocks.11.1.transformer_blocks.0.attn2.to_v.weight" + sdXlKey = "model.diffusion_model.output_blocks.5.1.transformer_blocks.1.attn1.to_v.weight" // SD XL + sdXlKey2 = "output_blocks.5.1.transformer_blocks.1.attn1.to_v.weight" + sdXlRefinerKey = "model.diffusion_model.output_blocks.8.1.transformer_blocks.1.attn1.to_v.weight" // SD XL Refiner + sdXlRefinerKey2 = "output_blocks.8.1.transformer_blocks.1.attn1.to_v.weight" + sd3Key = "model.diffusion_model.joint_blocks.23.x_block.attn.proj.weight" // SD 3.x + sd3Key2 = "joint_blocks.23.x_block.attn.proj.weight" + sdInPaintFeatureKey = "model.diffusion_model.input_blocks.0.0.weight" // SD in-paint feature + sdInPaintFeatureKey2 = "input_blocks.0.0.weight" fluxKey = "model.diffusion_model.double_blocks.0.txt_attn.proj.weight" // FLUX.1 fluxKey2 = "double_blocks.0.txt_attn.proj.weight" @@ -295,19 +420,28 @@ func (gf *GGUFFile) diffuserArchitecture() (ga GGUFArchitecture) { // Conditioner - openAiClipVitL14Key = "cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.k_proj.weight" // OpenAI CLIP ViT-L/14 - openClipVitH14Key = "cond_stage_model.transformer.text_model.encoder.layers.22.self_attn.k_proj.weight" // OpenCLIP ViT-H/14 - openClipVitG14Key = "cond_stage_model.1.transformer.text_model.encoder.layers.31.self_attn.k_proj.weight" // OpenCLIP ViT-G/14 - t5xxlKey = "cond_stage_model.1.transformer.encoder.block.23.layer.0.SelfAttention.k.weight" // Google T5-xxl - t5xxlKey2 = "cond_stage_model.2.transformer.encoder.block.23.layer.0.SelfAttention.k.weight" + openAiClipVitL14Key = "cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.k_proj.weight" // OpenAI CLIP ViT-L/14 + openAiClipVitL14Key2 = "text_model.encoder.layers.11.self_attn.k_proj.weight" + openClipVitH14Key = "cond_stage_model.transformer.text_model.encoder.layers.22.self_attn.k_proj.weight" // OpenCLIP ViT-H/14 + openClipVitH14Key2 = "text_model.encoder.layers.22.self_attn.k_proj.weight" + openClipVitG14Key = 
"cond_stage_model.1.transformer.text_model.encoder.layers.31.self_attn.k_proj.weight" // OpenCLIP ViT-G/14 + openClipVitG14Key2 = "text_model.encoder.layers.31.self_attn.k_proj.weight" + t5xxlKey = "cond_stage_model.1.transformer.encoder.block.23.layer.0.SelfAttention.k.weight" // Google T5-xxl + t5xxlKey2 = "cond_stage_model.2.transformer.encoder.block.23.layer.0.SelfAttention.k.weight" + t5xxlKey3 = "encoder.block.23.layer.0.SelfAttention.k.weight" ) tis, _ := gf.TensorInfos.Index([]string{ sdKey, + sdKey2, sdXlKey, + sdXlKey2, sdXlRefinerKey, + sdXlRefinerKey2, sd3Key, + sd3Key2, sdInPaintFeatureKey, + sdInPaintFeatureKey2, fluxKey, fluxKey2, @@ -315,10 +449,14 @@ func (gf *GGUFFile) diffuserArchitecture() (ga GGUFArchitecture) { fluxFillFeatureKey2, openAiClipVitL14Key, + openAiClipVitL14Key2, openClipVitH14Key, + openClipVitH14Key2, openClipVitG14Key, + openClipVitG14Key2, t5xxlKey, t5xxlKey2, + t5xxlKey3, }) ga.Type = "model" @@ -332,6 +470,14 @@ func (gf *GGUFFile) diffuserArchitecture() (ga GGUFArchitecture) { if ti, ok := tis[sdInPaintFeatureKey]; ok && ti.Dimensions[2] == 9 { ga.DiffusionArchitecture += " InPaint" } + } else if _, ok := tis[sdKey2]; ok { + ga.DiffusionArchitecture = "Stable Diffusion 1.x" + if ti.Dimensions[0] == 1024 { + ga.DiffusionArchitecture = "Stable Diffusion 2.x" + } + if ti, ok := tis[sdInPaintFeatureKey2]; ok && ti.Dimensions[2] == 9 { + ga.DiffusionArchitecture += " InPaint" + } } else if _, ok := tis[sdXlKey]; ok { ga.DiffusionArchitecture = "Stable Diffusion XL" if _, ok = tis[sdXlRefinerKey]; ok { @@ -340,9 +486,20 @@ func (gf *GGUFFile) diffuserArchitecture() (ga GGUFArchitecture) { if ti, ok := tis[sdInPaintFeatureKey]; ok && ti.Dimensions[2] == 9 { ga.DiffusionArchitecture += " InPaint" } + } else if _, ok := tis[sdXlKey2]; ok { + ga.DiffusionArchitecture = "Stable Diffusion XL" + if _, ok = tis[sdXlRefinerKey2]; ok { + ga.DiffusionArchitecture = "Stable Diffusion XL Refiner" + } + if ti, ok := tis[sdInPaintFeatureKey2]; ok && ti.Dimensions[2] == 9 { + ga.DiffusionArchitecture += " InPaint" + } } else if _, ok := tis[sd3Key]; ok { ga.DiffusionArchitecture = "Stable Diffusion 3.x" ga.DiffusionTransformer = true + } else if _, ok := tis[sd3Key2]; ok { + ga.DiffusionArchitecture = "Stable Diffusion 3.x" + ga.DiffusionTransformer = true } if _, ok := tis[fluxKey]; ok { ga.DiffusionArchitecture = "FLUX.1" @@ -370,12 +527,29 @@ func (gf *GGUFFile) diffuserArchitecture() (ga GGUFArchitecture) { } } ga.DiffusionConditioners = append(ga.DiffusionConditioners, cond) + } else if ti, ok := tis[openAiClipVitL14Key2]; ok { + cond := GGUFArchitectureDiffusionConditioner{ + Architecture: "OpenAI CLIP ViT-L/14", + FileType: ti.GetFileType(), + } + if ti, ok = tis[openClipVitH14Key2]; ok { + cond = GGUFArchitectureDiffusionConditioner{ + Architecture: "OpenCLIP ViT-H/14", + FileType: ti.GetFileType(), + } + } + ga.DiffusionConditioners = append(ga.DiffusionConditioners, cond) } if ti, ok := tis[openClipVitG14Key]; ok { ga.DiffusionConditioners = append(ga.DiffusionConditioners, GGUFArchitectureDiffusionConditioner{ Architecture: "OpenCLIP ViT-G/14", FileType: ti.GetFileType(), }) + } else if ti, ok = tis[openClipVitG14Key2]; ok { + ga.DiffusionConditioners = append(ga.DiffusionConditioners, GGUFArchitectureDiffusionConditioner{ + Architecture: "OpenCLIP ViT-G/14", + FileType: ti.GetFileType(), + }) } if ti, ok := tis[t5xxlKey]; ok { ga.DiffusionConditioners = append(ga.DiffusionConditioners, GGUFArchitectureDiffusionConditioner{ @@ -387,12 +561,23 @@ func (gf 
*GGUFFile) diffuserArchitecture() (ga GGUFArchitecture) { Architecture: "Google T5-xxl", FileType: ti.GetFileType(), }) + } else if ti, ok = tis[t5xxlKey3]; ok { + ga.DiffusionConditioners = append(ga.DiffusionConditioners, GGUFArchitectureDiffusionConditioner{ + Architecture: "Google T5-xxl", + FileType: ti.GetFileType(), + }) } - if tis := gf.TensorInfos.Search(regexp.MustCompile(`^first_stage_model\..*`)); len(tis) != 0 { - ga.DiffusionAutoencoder = &GGUFArchitectureDiffusionAutoencoder{ - Architecture: ga.DiffusionArchitecture + " VAE", - FileType: GGUFTensorInfos(tis).GetFileType(), + for _, re := range []*regexp.Regexp{ + regexp.MustCompile(`^first_stage_model\..*`), + regexp.MustCompile(`^decoder\.conv_in\..*`), + } { + if tis := gf.TensorInfos.Search(re); len(tis) != 0 { + ga.DiffusionAutoencoder = &GGUFArchitectureDiffusionAutoencoder{ + Architecture: ga.DiffusionArchitecture + " VAE", + FileType: GGUFTensorInfos(tis).GetFileType(), + } + break } } @@ -401,30 +586,37 @@ func (gf *GGUFFile) diffuserArchitecture() (ga GGUFArchitecture) { func (gf *GGUFFile) clipArchitecture() (ga GGUFArchitecture) { const ( - projectorTypeKey = "clip.projector_type" - hasLLaVAProjectorKey = "clip.has_llava_projector" - hasMiniCPMVProjector = "clip.has_minicpmv_projector" - miniCPMVVersionKey = "clip.minicpmv_version" - hasGLMProjectorKey = "clip.has_glm_projector" - hasQwen2VLMergerKey = "clip.has_qwen2vl_merger" - hasTextEncoderKey = "clip.has_text_encoder" - hasVisionEncoderKey = "clip.has_vision_encoder" - visionImageSizeKey = "clip.vision.image_size" - visionPatchSizeKey = "clip.vision.patch_size" - visionProjectionDim = "clip.vision.projection_dim" - visionMMPatchMergeType = "clip.vision.mm_patch_merge_type" - - textEmbeddingLengthKey = "clip.text.embedding_length" - textBlockCountKey = "clip.text.block_count" - textFeedForwardLengthKey = "clip.text.feed_forward_length" - textAttentionHeadCountKey = "clip.text.attention.head_count" - textAttentionLayerNormRMSEpsilonKey = "clip.text.attention.layer_norm_epsilon" - + projectorTypeKey = "clip.projector_type" + hasLLaVAProjectorKey = "clip.has_llava_projector" + hasMiniCPMVProjector = "clip.has_minicpmv_projector" + miniCPMVVersionKey = "clip.minicpmv_version" + miniCPMVQueryNumKey = "clip.minicpmv_query_num" + hasGLMProjectorKey = "clip.has_glm_projector" + hasQwen2VLMergerKey = "clip.has_qwen2vl_merger" + + hasVisionEncoderKey = "clip.has_vision_encoder" visionEmbeddingLengthKey = "clip.vision.embedding_length" visionBlockCountKey = "clip.vision.block_count" visionFeedForwardLengthKey = "clip.vision.feed_forward_length" visionAttentionHeadCountKey = "clip.vision.attention.head_count" visionAttentionLayerNormRMSEpsilonKey = "clip.vision.attention.layer_norm_epsilon" + visionProjectionDimKey = "clip.vision.projection_dim" + visionProjectorScaleFactorKey = "clip.vision.projector.scale_factor" + visionImageSizeKey = "clip.vision.image_size" + visionPatchSizeKey = "clip.vision.patch_size" + visionMMPatchMergeTypeKey = "clip.vision.mm_patch_merge_type" + visioSpatialMergeSizeKey = "clip.vision.spatial_merge_size" + visionWindowAttentionPatternKey = "clip.vision.n_wa_pattern" + + hasAudioEncoderKey = "clip.has_audio_encoder" + audioEmbeddingLengthKey = "clip.audio.embedding_length" + audioBlockCountKey = "clip.audio.block_count" + audioFeedForwardLengthKey = "clip.audio.feed_forward_length" + audioAttentionHeadCountKey = "clip.audio.attention.head_count" + audioAttentionLayerNormRMSEpsilonKey = "clip.audio.attention.layer_norm_epsilon" + 
audioProjectionDimKey = "clip.audio.projection_dim" + audioProjectorStackFactorKey = "clip.audio.projector.stack_factor" + audioNumMelBinsKey = "clip.audio.num_mel_bins" ) ga.Type = "projector" @@ -435,24 +627,33 @@ func (gf *GGUFFile) clipArchitecture() (ga GGUFArchitecture) { hasLLaVAProjectorKey, hasMiniCPMVProjector, miniCPMVVersionKey, + miniCPMVQueryNumKey, hasGLMProjectorKey, hasQwen2VLMergerKey, - hasTextEncoderKey, + // Vision hasVisionEncoderKey, - visionImageSizeKey, - visionPatchSizeKey, - visionProjectionDim, - visionMMPatchMergeType, - textEmbeddingLengthKey, - textBlockCountKey, - textFeedForwardLengthKey, - textAttentionHeadCountKey, - textAttentionLayerNormRMSEpsilonKey, visionEmbeddingLengthKey, visionBlockCountKey, visionFeedForwardLengthKey, visionAttentionHeadCountKey, visionAttentionLayerNormRMSEpsilonKey, + visionProjectionDimKey, + visionProjectorScaleFactorKey, + visionImageSizeKey, + visionPatchSizeKey, + visionMMPatchMergeTypeKey, + visioSpatialMergeSizeKey, + visionWindowAttentionPatternKey, + // Audio + hasAudioEncoderKey, + audioEmbeddingLengthKey, + audioBlockCountKey, + audioFeedForwardLengthKey, + audioAttentionHeadCountKey, + audioAttentionLayerNormRMSEpsilonKey, + audioProjectionDimKey, + audioProjectorStackFactorKey, + audioNumMelBinsKey, }) if v, ok := m[projectorTypeKey]; ok { @@ -469,96 +670,109 @@ func (gf *GGUFFile) clipArchitecture() (ga GGUFArchitecture) { if v, ok := m[miniCPMVVersionKey]; ok { ga.ClipMiniCPMVVersion = ValueNumeric[int32](v) } + if v, ok := m[miniCPMVQueryNumKey]; ok { + ga.ClipMiniCPMVQueryNum = ValueNumeric[int32](v) + } if v, ok := m[hasGLMProjectorKey]; ok { ga.ClipHasGLMProjector = v.ValueBool() } if v, ok := m[hasQwen2VLMergerKey]; ok { ga.ClipHasQwen2VLMerger = v.ValueBool() } - if v, ok := m[hasTextEncoderKey]; ok { - ga.ClipHasTextEncoder = v.ValueBool() - } + // Vision if v, ok := m[hasVisionEncoderKey]; ok { ga.ClipHasVisionEncoder = v.ValueBool() } + if v, ok := m[visionEmbeddingLengthKey]; ok { + ga.ClipVisionEmbeddingLength = ValueNumeric[uint64](v) + } + if v, ok := m[visionBlockCountKey]; ok { + ga.ClipVisionBlockCount = ValueNumeric[uint64](v) + } + if v, ok := m[visionFeedForwardLengthKey]; ok { + if v.ValueType == GGUFMetadataValueTypeArray { + ga.ClipVisionFeedForwardLength = ValuesNumeric[uint64](v.ValueArray()) + } else { + vx := ValueNumeric[uint64](v) + ga.ClipVisionFeedForwardLength = make([]uint64, ga.ClipVisionBlockCount) + for i := range ga.ClipVisionFeedForwardLength { + ga.ClipVisionFeedForwardLength[i] = vx + } + } + } + if v, ok := m[visionAttentionHeadCountKey]; ok { + ga.ClipVisionAttentionHeadCount = ValueNumeric[uint64](v) + } + if v, ok := m[visionAttentionLayerNormRMSEpsilonKey]; ok { + ga.ClipVisionAttentionLayerNormRMSEpsilon = ValueNumeric[float32](v) + } if v, ok := m[visionImageSizeKey]; ok { ga.ClipVisionImageSize = ValueNumeric[uint32](v) } + if v, ok := m[visionProjectionDimKey]; ok { + ga.ClipVisionProjectionDim = ValueNumeric[uint32](v) + } + ga.ClipVisionProjectorScaleFactor = 1 + if ga.ClipProjectorType == "gemma3" { + ga.ClipVisionProjectorScaleFactor = 4 + } + if v, ok := m[visionProjectorScaleFactorKey]; ok { + ga.ClipVisionProjectorScaleFactor = ValueNumeric[uint32](v) + } + ga.ClipVisionPatchSize = 1 if v, ok := m[visionPatchSizeKey]; ok { ga.ClipVisionPatchSize = ValueNumeric[uint32](v) } - if v, ok := m[visionProjectionDim]; ok { - ga.ClipVisionProjectionDim = ValueNumeric[uint32](v) - } ga.ClipVisionMMPatchMergeType = "flat" - if v, ok := m[visionMMPatchMergeType]; ok { + 
if v, ok := m[visionMMPatchMergeTypeKey]; ok { ga.ClipVisionMMPatchMergeType = v.ValueString() } - - if v, ok := m[textEmbeddingLengthKey]; ok { - ga.EmbeddingLength = ValueNumeric[uint64](v) + if v, ok := m[visioSpatialMergeSizeKey]; ok { + ga.ClipVisionSpatialMergeSize = ValueNumeric[uint32](v) } - if v, ok := m[textBlockCountKey]; ok { - ga.BlockCount = ValueNumeric[uint64](v) + if v, ok := m[visionWindowAttentionPatternKey]; ok { + ga.ClipVisionWindowAttentionPattern = ValueNumeric[uint32](v) } - if v, ok := m[textFeedForwardLengthKey]; ok { - if v.ValueType == GGUFMetadataValueTypeArray { - ga.FeedForwardLength = ValuesNumeric[uint64](v.ValueArray()) - } else { - vx := ValueNumeric[uint64](v) - ga.FeedForwardLength = make([]uint64, ga.BlockCount) - for i := range ga.FeedForwardLength { - ga.FeedForwardLength[i] = vx - } - } + // Audio + if v, ok := m[hasAudioEncoderKey]; ok { + ga.ClipHasAudioEncoder = v.ValueBool() } - if v, ok := m[textAttentionHeadCountKey]; ok { - ga.AttentionHeadCount = ValueNumeric[uint64](v) + if v, ok := m[audioEmbeddingLengthKey]; ok { + ga.ClipAudioEmbeddingLength = ValueNumeric[uint64](v) } - if v, ok := m[textAttentionLayerNormRMSEpsilonKey]; ok { - ga.AttentionLayerNormRMSEpsilon = ValueNumeric[float32](v) - } - - if v, ok := m[visionEmbeddingLengthKey]; ok { - ga.EmbeddingLength = ValueNumeric[uint64](v) + if v, ok := m[audioBlockCountKey]; ok { + ga.ClipAudioBlockCount = ValueNumeric[uint64](v) } - if v, ok := m[visionBlockCountKey]; ok { - ga.BlockCount = ValueNumeric[uint64](v) - } - if v, ok := m[visionFeedForwardLengthKey]; ok { + if v, ok := m[audioFeedForwardLengthKey]; ok { if v.ValueType == GGUFMetadataValueTypeArray { - ga.FeedForwardLength = ValuesNumeric[uint64](v.ValueArray()) + ga.ClipAudioFeedForwardLength = ValuesNumeric[uint64](v.ValueArray()) } else { vx := ValueNumeric[uint64](v) - ga.FeedForwardLength = make([]uint64, ga.BlockCount) - for i := range ga.FeedForwardLength { - ga.FeedForwardLength[i] = vx + ga.ClipAudioFeedForwardLength = make([]uint64, ga.ClipAudioBlockCount) + for i := range ga.ClipAudioFeedForwardLength { + ga.ClipAudioFeedForwardLength[i] = vx } } } - if v, ok := m[visionAttentionHeadCountKey]; ok { - ga.AttentionHeadCount = ValueNumeric[uint64](v) + if v, ok := m[audioAttentionHeadCountKey]; ok { + ga.ClipAudioAttentionHeadCount = ValueNumeric[uint64](v) } - if v, ok := m[visionAttentionLayerNormRMSEpsilonKey]; ok { - ga.AttentionLayerNormRMSEpsilon = ValueNumeric[float32](v) + if v, ok := m[audioAttentionLayerNormRMSEpsilonKey]; ok { + ga.ClipAudioAttentionLayerNormRMSEpsilon = ValueNumeric[float32](v) + } + if v, ok := m[audioProjectionDimKey]; ok { + ga.ClipAudioProjectionDim = ValueNumeric[uint32](v) + } + ga.ClipAudioProjectorStackFactor = 1 + if v, ok := m[audioProjectorStackFactorKey]; ok { + ga.ClipAudioProjectorStackFactor = ValueNumeric[uint32](v) + } + if v, ok := m[audioNumMelBinsKey]; ok { + ga.ClipAudioNumMelBins = ValueNumeric[uint32](v) } ga.AttentionHeadCountKV = ga.AttentionHeadCount - { - if ga.AttentionHeadCountKV > 0 { - ga.EmbeddingGQA = ga.AttentionHeadCount / ga.AttentionHeadCountKV - } - if ga.AttentionHeadCount > 0 { - ga.EmbeddingKeyGQA = uint64(ga.AttentionKeyLength) * ga.AttentionHeadCountKV - ga.EmbeddingValueGQA = uint64(ga.AttentionValueLength) * ga.AttentionHeadCountKV - } - if ga.Architecture == "mamba" { - ga.EmbeddingKeyGQA = uint64((ga.SSMConvolutionKernel - 1) * ga.SSMInnerSize) - ga.EmbeddingValueGQA = uint64(ga.SSMStateSize * ga.SSMInnerSize) - } - } - return ga } @@ -597,6 
+811,13 @@ func (gf *GGUFFile) adapterArchitecture(arch string) (ga GGUFArchitecture) { return ga } +func (gf *GGUFFile) imatrixArchitecture(_ string) (ga GGUFArchitecture) { + ga.Type = "imatrix" + ga.Architecture = "imatrix" + + return ga +} + func (gf *GGUFFile) transformerArchitecture(arch string) (ga GGUFArchitecture) { var ( contextLengthKey = arch + ".context_length" @@ -608,21 +829,28 @@ func (gf *GGUFFile) transformerArchitecture(arch string) (ga GGUFArchitecture) { expertSharedFeedForwardLengthKey = arch + ".expert_shared_feed_forward_length" expertCountKey = arch + ".expert_count" expertUsedCountKey = arch + ".expert_used_count" + expertSharedCountKey = arch + ".expert_shared_count" attentionHeadCountKey = arch + ".attention.head_count" attentionHeadCountKVKey = arch + ".attention.head_count_kv" + attentionSlidingWindowKey = arch + ".attention.sliding_window" attentionMaxALiBIBiasKey = arch + ".attention.max_alibi_bias" attentionMaxALiBIBiasKey2 = arch + ".attention.alibi_bias_max" attentionClampKQVKey = arch + ".attention.clamp_kqv" attentionClampKQVKey2 = arch + ".attention.clip_kqv" attentionLayerNormEpsilonKey = arch + ".attention.layer_norm_epsilon" attentionLayerNormRMSEpsilonKey = arch + ".attention.layer_norm_rms_epsilon" + attentionQueryLORARankKey = arch + ".attention.q_lora_rank" + attentionKeyValueLORARankKey = arch + ".attention.kv_lora_rank" attentionKeyLengthKey = arch + ".attention.key_length" + attentionKeyLengthMLAKey = arch + ".attention.key_length_mla" attentionValueLengthKey = arch + ".attention.value_length" + attentionValueLengthMLAKey = arch + ".attention.value_length_mla" attentionCausalKey = arch + ".attention.causal" ropeDimensionCountKey = arch + ".rope.dimension_count" ropeFrequencyBaseKey = arch + ".rope.freq_base" + ropeFrequencyScaleKey = arch + ".rope.freq_scale" ropeScaleLinearKey = arch + ".rope.scale_linear" ropeScalingTypeKey = arch + ".rope.scaling.type" ropeScalingFactorKey = arch + ".rope.scaling.factor" @@ -633,6 +861,13 @@ func (gf *GGUFFile) transformerArchitecture(arch string) (ga GGUFArchitecture) { ssmInnerSizeKey = arch + ".ssm.inner_size" ssmStateSizeKey = arch + ".ssm.state_size" ssmTimeStepRankKey = arch + ".ssm.time_step_rank" + ssmGroupCountKey = arch + ".ssm.group_count" + + rwkvHeadSizeKey = arch + ".wkv.head_size" + rwkvRescaleEveryNLayersKey = arch + ".rescale_every_n_layers" + rwkvTimeMixExtraDimensionKey = arch + ".time_mix_extra_dim" + rwkvTimeDecayExtraDimensionKey = arch + ".time_decay_extra_dim" + rwkvTokenShiftCountKey = arch + ".token_shift_count" vocabularyLengthKey = arch + ".vocab_size" tokenizerGGMLTokensKey = "tokenizer.ggml.tokens" @@ -650,19 +885,26 @@ func (gf *GGUFFile) transformerArchitecture(arch string) (ga GGUFArchitecture) { expertSharedFeedForwardLengthKey, expertCountKey, expertUsedCountKey, + expertSharedCountKey, attentionHeadCountKey, attentionHeadCountKVKey, + attentionSlidingWindowKey, attentionMaxALiBIBiasKey, attentionMaxALiBIBiasKey2, attentionClampKQVKey, attentionClampKQVKey2, attentionLayerNormEpsilonKey, attentionLayerNormRMSEpsilonKey, + attentionQueryLORARankKey, + attentionKeyValueLORARankKey, attentionKeyLengthKey, + attentionKeyLengthMLAKey, attentionValueLengthKey, + attentionValueLengthMLAKey, attentionCausalKey, ropeDimensionCountKey, ropeFrequencyBaseKey, + ropeFrequencyScaleKey, ropeScaleLinearKey, ropeScalingTypeKey, ropeScalingFactorKey, @@ -672,6 +914,12 @@ func (gf *GGUFFile) transformerArchitecture(arch string) (ga GGUFArchitecture) { ssmInnerSizeKey, ssmStateSizeKey, 
ssmTimeStepRankKey, + ssmGroupCountKey, + rwkvHeadSizeKey, + rwkvRescaleEveryNLayersKey, + rwkvTimeMixExtraDimensionKey, + rwkvTimeDecayExtraDimensionKey, + rwkvTokenShiftCountKey, vocabularyLengthKey, tokenizerGGMLTokensKey, }) @@ -703,6 +951,9 @@ func (gf *GGUFFile) transformerArchitecture(arch string) (ga GGUFArchitecture) { if v, ok := m[expertUsedCountKey]; ok { ga.ExpertUsedCount = ValueNumeric[uint32](v) } + if v, ok := m[expertSharedCountKey]; ok { + ga.ExpertSharedCount = ValueNumeric[uint32](v) + } if v, ok := m[expertFeedForwardLengthKey]; ok { ga.ExpertFeedForwardLength = ValueNumeric[uint64](v) } @@ -726,6 +977,33 @@ func (gf *GGUFFile) transformerArchitecture(arch string) (ga GGUFArchitecture) { } else { ga.AttentionHeadCountKV = ga.AttentionHeadCount } + ga.AttentionSlidingWindowPattern = 1 + if v, ok := m[attentionSlidingWindowKey]; ok { + if v.ValueType == GGUFMetadataValueTypeArray { + ga.AttentionSlidingWindow = ValuesNumeric[uint64](v.ValueArray())[0] + } else { + ga.AttentionSlidingWindow = ValueNumeric[uint64](v) + } + } + switch arch { + case "llama4": + if ga.AttentionSlidingWindow == 0 { + ga.AttentionSlidingWindow = 8192 + } + ga.AttentionSlidingWindowPattern = 4 + case "phi3": + // See https://github.com/ggml-org/llama.cpp/pull/13676 + ga.AttentionSlidingWindow = 0 + case "gemma2": + if ga.AttentionSlidingWindow == 0 { + ga.AttentionSlidingWindow = 4096 + } + ga.AttentionSlidingWindowPattern = 2 + case "gemma3": + ga.AttentionSlidingWindowPattern = 6 + case "cohere2": + ga.AttentionSlidingWindowPattern = 4 + } if v, ok := m[attentionMaxALiBIBiasKey]; ok { ga.AttentionMaxALiBIBias = ValueNumeric[float32](v) } else if v, ok := m[attentionMaxALiBIBiasKey2]; ok { @@ -742,37 +1020,76 @@ func (gf *GGUFFile) transformerArchitecture(arch string) (ga GGUFArchitecture) { if v, ok := m[attentionLayerNormRMSEpsilonKey]; ok { ga.AttentionLayerNormRMSEpsilon = ValueNumeric[float32](v) } + if v, ok := m[attentionQueryLORARankKey]; ok { + ga.AttentionQueryLORARank = ValueNumeric[uint32](v) + } + if v, ok := m[attentionKeyValueLORARankKey]; ok { + ga.AttentionKeyValueLORARank = ValueNumeric[uint32](v) + } if v, ok := m[attentionKeyLengthKey]; ok { ga.AttentionKeyLength = ValueNumeric[uint32](v) } else if ga.AttentionHeadCount != 0 { ga.AttentionKeyLength = uint32(ga.EmbeddingLength / ga.AttentionHeadCount) } + if v, ok := m[attentionKeyLengthMLAKey]; ok { + ga.AttentionKeyLengthMLA = ValueNumeric[uint32](v) + } if v, ok := m[attentionValueLengthKey]; ok { ga.AttentionValueLength = ValueNumeric[uint32](v) } else if ga.AttentionHeadCount != 0 { ga.AttentionValueLength = uint32(ga.EmbeddingLength / ga.AttentionHeadCount) } + if v, ok := m[attentionValueLengthMLAKey]; ok { + ga.AttentionValueLengthMLA = ValueNumeric[uint32](v) + } if v, ok := m[attentionCausalKey]; ok { ga.AttentionCausal = v.ValueBool() } else { ga.AttentionCausal = true } + // See https://github.com/ggml-org/llama.cpp/blob/6491d6e4f1caf0ad2221865b4249ae6938a6308c/src/llama-arch.cpp#L1913-L1924. + ga.AttentionRecurrent = slices.Contains([]string{ // TODO(thxCode): calculate this from the metadata. + "mamba", + "mamba2", + "rwkv6", + "rwkv6qwen2", + "rwkv7", + "arwkv7", + }, ga.Architecture) + // See https://github.com/ggml-org/llama.cpp/blob/a57d1bcb3c0165ac87b1f0dbb429839b0da69689/src/llama-arch.cpp#L2029-L2038. + ga.AttentionHybrid = slices.Contains([]string{ // TODO(thxCode): calculate this from the metadata. 
+ "jamba", + "falcon-h1", + "granitehybrid", + }, ga.Architecture) + ga.AttentionRecurrent = ga.AttentionHybrid || ga.AttentionRecurrent if v, ok := m[ropeDimensionCountKey]; ok { ga.RoPEDimensionCount = ValueNumeric[uint64](v) } + ga.RoPEFrequencyBase = 10000.0 if v, ok := m[ropeFrequencyBaseKey]; ok { ga.RoPEFrequencyBase = ValueNumeric[float32](v) } - if v, ok := m[ropeScaleLinearKey]; ok { - ga.RoPEScalingType = "linear" - ga.RoPEScalingFactor = ValueNumeric[float32](v) + ga.RoPEFrequencyScale = 1.0 + if v, ok := m[ropeFrequencyScaleKey]; ok { + ga.RoPEFrequencyScale = ValueNumeric[float32](v) } if v, ok := m[ropeScalingTypeKey]; ok { ga.RoPEScalingType = v.ValueString() } + if v, ok := m[ropeScaleLinearKey]; ok { + ga.RoPEScalingType = "linear" + ga.RoPEScalingFactor = ValueNumeric[float32](v) + if ga.RoPEScalingFactor != 0 { + ga.RoPEFrequencyScale = 1.0 / ga.RoPEScalingFactor + } + } if v, ok := m[ropeScalingFactorKey]; ok { ga.RoPEScalingFactor = ValueNumeric[float32](v) + if ga.RoPEScalingFactor != 0 { + ga.RoPEFrequencyScale = 1.0 / ga.RoPEScalingFactor + } } if v, ok := m[ropeScalingOriginalContextKey]; ok { ga.RoPEScalingOriginalContextLength = ValueNumeric[uint64](v) @@ -793,6 +1110,27 @@ func (gf *GGUFFile) transformerArchitecture(arch string) (ga GGUFArchitecture) { if v, ok := m[ssmTimeStepRankKey]; ok { ga.SSMTimeStepRank = ValueNumeric[uint32](v) } + if v, ok := m[ssmGroupCountKey]; ok { + ga.SSMGroupCount = ValueNumeric[uint32](v) + } + + if v, ok := m[rwkvHeadSizeKey]; ok { + ga.RWKVHeadSize = ValueNumeric[uint32](v) + } + if v, ok := m[rwkvRescaleEveryNLayersKey]; ok { + ga.RWKVRescaleEveryNLayers = ValueNumeric[uint32](v) + } + if v, ok := m[rwkvTimeMixExtraDimensionKey]; ok { + ga.RWKVTimeMixExtraDimension = ValueNumeric[uint32](v) + } + if v, ok := m[rwkvTimeDecayExtraDimensionKey]; ok { + ga.RWKVTimeDecayExtraDimension = ValueNumeric[uint32](v) + } + if v, ok := m[rwkvTokenShiftCountKey]; ok { + ga.RWKVTokenShiftCount = ValueNumeric[uint32](v) + } else if ga.AttentionRecurrent { + ga.RWKVTokenShiftCount = 2 + } if v, ok := m[vocabularyLengthKey]; ok { ga.VocabularyLength = ValueNumeric[uint64](v) @@ -800,19 +1138,5 @@ func (gf *GGUFFile) transformerArchitecture(arch string) (ga GGUFArchitecture) { ga.VocabularyLength = v.ValueArray().Len } - { - if ga.AttentionHeadCountKV > 0 { - ga.EmbeddingGQA = ga.AttentionHeadCount / ga.AttentionHeadCountKV - } - if ga.AttentionHeadCount > 0 { - ga.EmbeddingKeyGQA = uint64(ga.AttentionKeyLength) * ga.AttentionHeadCountKV - ga.EmbeddingValueGQA = uint64(ga.AttentionValueLength) * ga.AttentionHeadCountKV - } - if ga.Architecture == "mamba" { - ga.EmbeddingKeyGQA = uint64((ga.SSMConvolutionKernel - 1) * ga.SSMInnerSize) - ga.EmbeddingValueGQA = uint64(ga.SSMStateSize * ga.SSMInnerSize) - } - } - return ga } diff --git a/vendor/github.com/gpustack/gguf-parser-go/file_estimate__llamacpp.go b/vendor/github.com/gpustack/gguf-parser-go/file_estimate__llamacpp.go index 5a49aa37..a47fb0ca 100644 --- a/vendor/github.com/gpustack/gguf-parser-go/file_estimate__llamacpp.go +++ b/vendor/github.com/gpustack/gguf-parser-go/file_estimate__llamacpp.go @@ -1,6 +1,7 @@ package gguf_parser import ( + "math" "regexp" "slices" "strings" @@ -73,7 +74,11 @@ type ( LLaMACppRunDeviceUsage struct { // HandleLayers is the number of layers that the device can handle. HandleLayers uint64 `json:"handleLayers"` - // HandleLastLayer is the index of the last layer the device can handle. 
+ // HandleSWALayers is the number of layers that the device can handle in sliding window attention (SWA), + // the non SWA layers is `HandleLayers - HandleSWALayers`. + HandleSWALayers uint64 `json:"handleSWALayers"` + // HandleLastLayer is the index of the last layer the device can handle, + // -1 means the device does not handle the last layer. HandleLastLayer int `json:"handleLastLayer"` // HandleOutputLayer is the flag to indicate whether the device can handle the output layer, // true for handle. @@ -87,6 +92,8 @@ type ( // If Remote is true, Position is the position of the remote devices, // Otherwise, Position is the position of the device in the local devices. Position int `json:"position"` + // Endpoint is the endpoint of the remote device, empty for local devices. + Endpoint string `json:"endpoint,omitempty"` // Footprint is the memory footprint for bootstrapping. Footprint GGUFBytesScalar `json:"footprint"` // Parameter is the running parameters that the device processes. @@ -107,6 +114,8 @@ type ( Input GGUFParametersScalar `json:"input"` // Compute is the parameter usage for compute tensors. Compute GGUFParametersScalar `json:"compute"` + // ComputeOverridden is the parameter usage for overridden compute tensors. + ComputeOverridden GGUFParametersScalar `json:"computeOverridden"` // Output is the parameter usage for output tensors. Output GGUFParametersScalar `json:"output"` } @@ -117,6 +126,8 @@ type ( Input GGUFBytesScalar `json:"input"` // Compute is the memory usage for loading compute tensors. Compute GGUFBytesScalar `json:"compute"` + // ComputeOverridden is the memory usage for loading overridden compute tensors. + ComputeOverridden GGUFBytesScalar `json:"computeOverridden"` // Output is the memory usage for loading output tensors. Output GGUFBytesScalar `json:"output"` } @@ -142,7 +153,7 @@ type ( } ) -// EstimateLLaMACppRun returns the inference estimated result of the GGUF file. +// EstimateLLaMACppRun estimates the usages of the GGUF file in llama.cpp. func (gf *GGUFFile) EstimateLLaMACppRun(opts ...GGUFRunEstimateOption) (e LLaMACppRunEstimate) { // Options var o _GGUFRunEstimateOptions @@ -196,6 +207,7 @@ func (gf *GGUFFile) EstimateLLaMACppRun(opts ...GGUFRunEstimateOption) (e LLaMAC e.Devices[j+1].Remote = j < len(o.RPCServers) if e.Devices[j+1].Remote { e.Devices[j+1].Position = j + e.Devices[j+1].Endpoint = o.RPCServers[j] } else { e.Devices[j+1].Position = j - len(o.RPCServers) } @@ -215,22 +227,24 @@ func (gf *GGUFFile) EstimateLLaMACppRun(opts ...GGUFRunEstimateOption) (e LLaMAC case "projector": // For projector model, // see https://github.com/ggerganov/llama.cpp/blob/148ec970b62c3c5ae0a8bfdaad2fc237aaae350d/examples/llava/clip.cpp#L994-L1008. - if ptr.Deref(o.LMCOffloadLayers, a.BlockCount) != 0 { - // None model means full offload. - o.LMCOffloadLayers = ptr.To(a.BlockCount) + if ptr.Deref(o.LMCOffloadLayers, math.MaxUint64) != 0 { + // Full offload. + o.LMCOffloadLayers = ptr.To[uint64](math.MaxUint64) } else { - // None model means zero offload. + // Zero offload. 
o.LMCOffloadLayers = ptr.To[uint64](0) } gf.estimateLLaMACppRunInProjector(&o, &a, &e) case "adapter": - gf.estimateLLaMaCppRunInAdapter(&o, &a, &e) + gf.estimateLLaMACppRunInAdapter(&o, &a, &e) + case "imatrix": + gf.estimateLLaMACppRunInIMatrix(&o, &a, &e) } return e } -// estimateLLaMACppRunInModel estimates the inference result of the GGUF file in llama.cpp for model type, +// estimateLLaMACppRunInModel estimates the usages of the GGUF file for model, // including the usages of footprint, weight, KV cache, and computation. func (gf *GGUFFile) estimateLLaMACppRunInModel(o *_GGUFRunEstimateOptions, a *GGUFArchitecture, t *GGUFTokenizer, e *LLaMACppRunEstimate) { ls := gf.Layers() @@ -251,6 +265,9 @@ func (gf *GGUFFile) estimateLLaMACppRunInModel(o *_GGUFRunEstimateOptions, a *GG a.BlockCount = uint64(len(tfLs)) } + // Using sliding window attention. + usingSWA := a.AttentionSlidingWindowPattern != 1 && !o.LMCFullSizeSWACache + // Full offload: nLoadLayers == 0 && isOffloadOutputLayer // Zero offload: nOffloadLayers == 0 // Partial offload: !Full offload && !Zero offload @@ -260,7 +277,8 @@ func (gf *GGUFFile) estimateLLaMACppRunInModel(o *_GGUFRunEstimateOptions, a *GG nLoadLayers = a.BlockCount idxOutputDevice int - fullOffload, zeroOffload bool + fullOffload, zeroOffload bool + nSWALoadLayers, nSWAOffloadLayers uint64 ) { var isOffloadOutputLayer bool @@ -289,17 +307,25 @@ func (gf *GGUFFile) estimateLLaMACppRunInModel(o *_GGUFRunEstimateOptions, a *GG e.FullOffloaded = fullOffload e.OffloadLayers = nOffloadLayers - for i, j, offloadStart := 0, 0, len(tfLs)-int(nOffloadLayers); i < len(tfLs); i++ { + for i, j, offloadStart := uint64(0), 0, a.BlockCount-nOffloadLayers; i < a.BlockCount; i++ { switch { - case i < int(nLoadLayers): + case i < nLoadLayers: e.Devices[0].HandleLayers += 1 - e.Devices[0].HandleLastLayer = i + e.Devices[0].HandleLastLayer = int(i) + if usingSWA && (a.AttentionSlidingWindowPattern == 0 || i%uint64(a.AttentionSlidingWindowPattern) != 0) { + e.Devices[0].HandleSWALayers += 1 + nSWALoadLayers += 1 + } case i >= offloadStart: x := float64(i-offloadStart) / float64(nActualOffloadLayers) j = slicex.UpperBound(o.TensorSplitFraction, x) e.Devices[j+1].HandleLayers += 1 - e.Devices[j+1].HandleLastLayer = i - if fullOffload && i == len(tfLs)-1 { + e.Devices[j+1].HandleLastLayer = int(i) + if usingSWA && (a.AttentionSlidingWindowPattern == 0 || i%uint64(a.AttentionSlidingWindowPattern) != 0) { + e.Devices[j+1].HandleSWALayers += 1 + nSWAOffloadLayers += 1 + } + if fullOffload && i == a.BlockCount-1 { idxOutputDevice = j + 1 } } @@ -315,11 +341,6 @@ func (gf *GGUFFile) estimateLLaMACppRunInModel(o *_GGUFRunEstimateOptions, a *GG if a.Architecture == "grok" { o.FlashAttention = false } - // Attention key length must be equal to attention value length, - // see https://github.com/ggerganov/llama.cpp/blob/19d3c8293b1f61acbe2dab1d49a17950fd788a4a/src/llama.cpp#L9571-L9574. - if a.AttentionKeyLength != a.AttentionValueLength { - o.FlashAttention = false - } // Fallback to FP16 if the value type is quantized when disabling flash attention, // see https://github.com/ggerganov/llama.cpp/blob/19d3c8293b1f61acbe2dab1d49a17950fd788a4a/src/llama.cpp#L9576-L9579. if o.LMCCacheValueType.IsQuantized() && !o.FlashAttention { @@ -331,9 +352,21 @@ func (gf *GGUFFile) estimateLLaMACppRunInModel(o *_GGUFRunEstimateOptions, a *GG // Embedding. 
if !a.AttentionCausal { + ropeFrequencyBase := ptr.Deref(o.LMCRoPEFrequencyBase, a.RoPEFrequencyBase) + ropeFrequencyScale := ptr.Deref(o.LMCRoPEFrequencyScale, a.RoPEFrequencyScale) + ropeScalingType := ptr.Deref(o.LMCRoPEScalingType, a.RoPEScalingType) + ropeScalingOriginalContextSize := ptr.Deref(o.LMCRoPEScalingOriginalContextSize, int32(a.RoPEScalingOriginalContextLength)) + isRoPECustomized := ropeFrequencyBase != a.RoPEFrequencyBase || + ropeFrequencyScale != a.RoPEFrequencyScale || + ropeScalingType != a.RoPEScalingType || + (ropeScalingType == "yarn" && ropeScalingOriginalContextSize != int32(a.RoPEScalingOriginalContextLength)) + e.EmbeddingOnly = true + o.LMCContextSize = ptr.To(ptr.Deref(o.LMCContextSize, int32(a.MaximumContextLength))) // Set context size/physical batch size/logical batch size to the training context size. - o.LMCContextSize = ptr.To(min(int32(a.MaximumContextLength), ptr.Deref(o.LMCContextSize, int32(a.MaximumContextLength)))) + if !isRoPECustomized { + o.LMCContextSize = ptr.To(min(int32(a.MaximumContextLength), *o.LMCContextSize)) + } o.LMCLogicalBatchSize = o.LMCContextSize o.LMCPhysicalBatchSize = o.LMCLogicalBatchSize // Reranking. @@ -350,15 +383,21 @@ func (gf *GGUFFile) estimateLLaMACppRunInModel(o *_GGUFRunEstimateOptions, a *GG e.LogicalBatchSize = *o.LMCLogicalBatchSize e.PhysicalBatchSize = *o.LMCPhysicalBatchSize + // Padding alignment. + paddingAlign := uint64(32) + if o.FlashAttention { + paddingAlign = 256 + } + // Init hyperparameters, // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L6957-L7000. var ( - nContext uint64 - nTokens uint64 - nBatch uint64 - nOutputs uint64 - nParallel uint64 - nKV uint64 + nContext uint64 + nTokens uint64 + nBatch uint64 + nOutputs uint64 + nSeq uint64 + nKV uint64 ) { nContext = a.MaximumContextLength @@ -370,27 +409,16 @@ func (gf *GGUFFile) estimateLLaMACppRunInModel(o *_GGUFRunEstimateOptions, a *GG } // Padding context size, // see https://github.com/ggerganov/llama.cpp/blob/278d0e18469aacf505be18ce790a63c7cc31be26/src/llama.cpp#L19001-L19002. - if o.FlashAttention { - nContext = GGMLPadding(nContext, 256) - } else { - nContext = GGMLPadding(nContext, 32) - } + nContext = GGMLPadding(nContext, paddingAlign) + // Correct token size, // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L12221-L12224. nTokens = min(nContext, uint64(*o.LMCPhysicalBatchSize)) nBatch = nTokens nOutputs = nTokens - nParallel = uint64(ptr.Deref(o.ParallelSize, 1)) + nSeq = uint64(ptr.Deref(o.ParallelSize, 1)) nKV = nContext - // For mamba, - // see https://github.com/ggerganov/llama.cpp/blob/7672adeec7a79ea271058c63106c142ba84f951a/llama.cpp#L16122-L16129. - if a.Architecture == "mamba" { - nKV = nParallel - o.LMCCacheKeyType = ptr.To(GGMLTypeF32) - o.LMCCacheValueType = ptr.To(GGMLTypeF32) - } - e.ContextSize = nContext } @@ -410,7 +438,10 @@ func (gf *GGUFFile) estimateLLaMACppRunInModel(o *_GGUFRunEstimateOptions, a *GG // Output buffer, // see https://github.com/ggerganov/llama.cpp/blob/7672adeec7a79ea271058c63106c142ba84f951a/llama.cpp#L11940-L12003. 
- ob := 4 /* float32 size */ * (a.VocabularyLength + a.EmbeddingLength) * nParallel + ob := a.EmbeddingLength * nOutputs * 4 /* float32 size */ + if a.AttentionCausal { + ob += a.VocabularyLength * nOutputs * 4 /* float32 size */ + } if fullOffload { e.Devices[idxOutputDevice].Footprint += GGUFBytesScalar(ob) } else { @@ -420,6 +451,66 @@ func (gf *GGUFFile) estimateLLaMACppRunInModel(o *_GGUFRunEstimateOptions, a *GG // Weight & Parameter. { + filter := func(idx int) GGUFTensorInfoFilter { + if len(o.OverriddenTensors) == 0 { + return nil + } + return func(name string) bool { + for _, ot := range o.OverriddenTensors { + bt, bi := ot.ParseBufferType() + switch { + case bt == GGUFRunOverriddenTensorBufferTypeUnknown: + continue + case bt == GGUFRunOverriddenTensorBufferTypeCPU && idx == 0: + continue + case bt == GGUFRunOverriddenTensorBufferTypeGPU && + (e.Devices[idx].Remote || anyx.Number[int](bi)+1 != idx): + continue + case bt == GGUFRunOverriddenTensorBufferTypeRPC && + (!e.Devices[idx].Remote || e.Devices[idx].Endpoint != bi): + continue + } + if ot.PatternRegex.MatchString(name) { + return false + } + } + return true + } + } + + // If overridden tensors are provided, + // we need to search the tensors of the overridden pattern, + // and place them in the correct device. + if len(o.OverriddenTensors) != 0 { + for _, ot := range o.OverriddenTensors { + bt, bi := ot.ParseBufferType() + if bt == GGUFRunOverriddenTensorBufferTypeUnknown { + continue + } + var sls GGUFTensorInfos = ls.Search(ot.PatternRegex) + if len(sls) == 0 { + continue + } + switch bt { + case GGUFRunOverriddenTensorBufferTypeCPU: + e.Devices[0].Weight.ComputeOverridden += GGUFBytesScalar(sls.Bytes()) + e.Devices[0].Parameter.ComputeOverridden += GGUFParametersScalar(sls.Elements()) + case GGUFRunOverriddenTensorBufferTypeGPU: + idx := anyx.Number[int](bi) + 1 + e.Devices[idx].Weight.ComputeOverridden += GGUFBytesScalar(sls.Bytes()) + e.Devices[idx].Parameter.ComputeOverridden += GGUFParametersScalar(sls.Elements()) + default: + for i, d := range e.Devices[1:] { + if d.Endpoint == bi { + e.Devices[i+1].Weight.ComputeOverridden += GGUFBytesScalar(sls.Bytes()) + e.Devices[i+1].Parameter.ComputeOverridden += GGUFParametersScalar(sls.Elements()) + break + } + } + } + } + } + // Compute. for i, j, offloadStart := 0, 0, len(tfLs)-int(nOffloadLayers); i < len(tfLs); i++ { idx := 0 @@ -428,8 +519,9 @@ func (gf *GGUFFile) estimateLLaMACppRunInModel(o *_GGUFRunEstimateOptions, a *GG j = slicex.UpperBound(o.TensorSplitFraction, x) idx = j + 1 } - e.Devices[idx].Weight.Compute += GGUFBytesScalar(tfLs[i].Bytes()) - e.Devices[idx].Parameter.Compute += GGUFParametersScalar(tfLs[i].Elements()) + f := filter(idx) + e.Devices[idx].Weight.Compute += GGUFBytesScalar(tfLs[i].Bytes(f)) + e.Devices[idx].Parameter.Compute += GGUFParametersScalar(tfLs[i].Elements(f)) } // IO, @@ -443,7 +535,7 @@ func (gf *GGUFFile) estimateLLaMACppRunInModel(o *_GGUFRunEstimateOptions, a *GG if _, ok := opLs.Get("output.weight"); ok { wg = GGUFBytesScalar(opLs.Bytes()) ps = GGUFParametersScalar(opLs.Elements()) - } else if a.AttentionCausal { + } else { wg = GGUFBytesScalar(opLs.Bytes()) + e.Devices[0].Weight.Input /* duplicate the input layer */ ps = GGUFParametersScalar(opLs.Elements() + ipLs.Elements()) } @@ -456,34 +548,105 @@ func (gf *GGUFFile) estimateLLaMACppRunInModel(o *_GGUFRunEstimateOptions, a *GG } } - // KV cache, - // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L2479-L2501. 
- { - kps, vps := a.EmbeddingKeyGQA*nKV, a.EmbeddingValueGQA*nKV - krs, vrs := o.LMCCacheKeyType.RowSizeOf([]uint64{kps}), o.LMCCacheValueType.RowSizeOf([]uint64{vps}) - - e.Devices[0].KVCache.Key = GGUFBytesScalar(krs * nLoadLayers) - e.Devices[0].KVCache.Value = GGUFBytesScalar(vrs * nLoadLayers) - e.Devices[0].Parameter.KVCache = GGUFParametersScalar((kps + vps) * nLoadLayers) - if !*o.LMCOffloadKVCache { - e.Devices[0].KVCache.Key += GGUFBytesScalar(krs * nOffloadLayers) - e.Devices[0].KVCache.Value += GGUFBytesScalar(vrs * nOffloadLayers) - e.Devices[0].Parameter.KVCache += GGUFParametersScalar((kps + vps) * nOffloadLayers) - } else if !zeroOffload { - for i, d := range e.Devices[1:] { - e.Devices[i+1].KVCache.Key = GGUFBytesScalar(krs * d.HandleLayers) - e.Devices[i+1].KVCache.Value = GGUFBytesScalar(vrs * d.HandleLayers) - e.Devices[i+1].Parameter.KVCache = GGUFParametersScalar((kps + vps) * d.HandleLayers) + // KV cache. + if a.AttentionCausal { + switch { + // Recurrent, + // see https://github.com/ggml-org/llama.cpp/blob/704bb7a71c01dc07c1478b85f6322bf5dfde1eaf/src/llama-hparams.cpp#L68-L88. + case a.AttentionRecurrent: + var r, s uint64 + if a.RWKVHeadSize > 0 { + r = uint64(a.RWKVTokenShiftCount) * a.EmbeddingLength + s = uint64(a.RWKVHeadSize) * a.EmbeddingLength + } else { + r = uint64((a.SSMConvolutionKernel - 1) * (a.SSMInnerSize + 2*a.SSMGroupCount*a.SSMStateSize)) + s = uint64(a.SSMStateSize * a.SSMInnerSize) + } + + rps, sps := r*nSeq, s*nSeq + rrs, srs := GGMLTypeF32.RowSizeOf([]uint64{rps}), GGMLTypeF32.RowSizeOf([]uint64{sps}) + + e.Devices[0].KVCache.Key += GGUFBytesScalar(rrs * nLoadLayers) + e.Devices[0].KVCache.Value += GGUFBytesScalar(srs * nLoadLayers) + e.Devices[0].Parameter.KVCache += GGUFParametersScalar((rrs + srs) * nLoadLayers) + if !*o.LMCOffloadKVCache { + e.Devices[0].KVCache.Key += GGUFBytesScalar(rrs * nOffloadLayers) + e.Devices[0].KVCache.Value += GGUFBytesScalar(srs * nOffloadLayers) + e.Devices[0].Parameter.KVCache += GGUFParametersScalar((rrs + srs) * nOffloadLayers) + } else if !zeroOffload { + for i, d := range e.Devices[1:] { + e.Devices[i+1].KVCache.Key += GGUFBytesScalar(rrs * d.HandleLayers) + e.Devices[i+1].KVCache.Value += GGUFBytesScalar(srs * d.HandleLayers) + e.Devices[i+1].Parameter.KVCache += GGUFParametersScalar((rrs + srs) * d.HandleLayers) + } + } + + if !a.AttentionHybrid { + break + } + + fallthrough + // Causal, + // see https://github.com/ggml-org/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L2479-L2501. 
+ default: + akl, avl := uint64(a.AttentionKeyLength), uint64(a.AttentionValueLength) + if a.AttentionKeyLengthMLA > 0 && a.AttentionValueLengthMLA > 0 { + akl, avl = uint64(a.AttentionKeyLengthMLA), uint64(a.AttentionValueLengthMLA) + } + kGQA := akl * a.AttentionHeadCountKV + vGQA := avl * a.AttentionHeadCountKV + kps, vps := kGQA*nKV, vGQA*nKV + krs, vrs := o.LMCCacheKeyType.RowSizeOf([]uint64{kps}), o.LMCCacheValueType.RowSizeOf([]uint64{vps}) + + if !usingSWA { + e.Devices[0].KVCache.Key += GGUFBytesScalar(krs * nLoadLayers) + e.Devices[0].KVCache.Value += GGUFBytesScalar(vrs * nLoadLayers) + e.Devices[0].Parameter.KVCache += GGUFParametersScalar((kps + vps) * nLoadLayers) + if !*o.LMCOffloadKVCache { + e.Devices[0].KVCache.Key += GGUFBytesScalar(krs * nOffloadLayers) + e.Devices[0].KVCache.Value += GGUFBytesScalar(vrs * nOffloadLayers) + e.Devices[0].Parameter.KVCache += GGUFParametersScalar((kps + vps) * nOffloadLayers) + } else if !zeroOffload { + for i, d := range e.Devices[1:] { + e.Devices[i+1].KVCache.Key += GGUFBytesScalar(krs * d.HandleLayers) + e.Devices[i+1].KVCache.Value += GGUFBytesScalar(vrs * d.HandleLayers) + e.Devices[i+1].Parameter.KVCache += GGUFParametersScalar((kps + vps) * d.HandleLayers) + } + } + } else { + // Sliding window attention size, + // see https://github.com/ggml-org/llama.cpp/blob/3079e9ac8e04ef6eddeb0c164d72edb6b6fd2df5/src/llama-kv-cache.cpp#L1640-L1642. + swas := min(nKV, GGMLPadding(a.AttentionSlidingWindow*nSeq+uint64(*o.LMCLogicalBatchSize), paddingAlign)) + swaKps, swaVps := kGQA*swas, vGQA*swas + swaKrs, swaVrs := o.LMCCacheKeyType.RowSizeOf([]uint64{swaKps}), o.LMCCacheValueType.RowSizeOf([]uint64{swaVps}) + + nNonSWALoadLayers, nNonSWAOffloadLayers := nLoadLayers-nSWALoadLayers, nOffloadLayers-nSWAOffloadLayers + + e.Devices[0].KVCache.Key += GGUFBytesScalar(swaKrs*nSWALoadLayers + krs*nNonSWALoadLayers) + e.Devices[0].KVCache.Value += GGUFBytesScalar(swaVrs*nSWALoadLayers + vrs*nNonSWALoadLayers) + e.Devices[0].Parameter.KVCache += GGUFParametersScalar((swaKps+swaVps)*nSWALoadLayers + (kps+vps)*nNonSWALoadLayers) + if !*o.LMCOffloadKVCache { + e.Devices[0].KVCache.Key += GGUFBytesScalar(swaKrs*nSWAOffloadLayers + krs*nNonSWAOffloadLayers) + e.Devices[0].KVCache.Value += GGUFBytesScalar(swaVrs*nSWAOffloadLayers + vrs*nNonSWAOffloadLayers) + e.Devices[0].Parameter.KVCache += GGUFParametersScalar((swaKps+swaVps)*nSWAOffloadLayers + (kps+vps)*nNonSWAOffloadLayers) + } else if !zeroOffload { + for i, d := range e.Devices[1:] { + e.Devices[i+1].KVCache.Key += GGUFBytesScalar(swaKrs*d.HandleSWALayers + krs*(d.HandleLayers-d.HandleSWALayers)) + e.Devices[i+1].KVCache.Value += GGUFBytesScalar(swaVrs*d.HandleSWALayers + vrs*(d.HandleLayers-d.HandleSWALayers)) + e.Devices[i+1].Parameter.KVCache += GGUFParametersScalar((swaKps+swaVps)*d.HandleSWALayers + (kps+vps)*(d.HandleLayers-d.HandleSWALayers)) + } + } } } } // Computation. { - // Bootstrap, compute metadata, - // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L16135-L16136. - cm := GGMLTensorOverhead()*GGMLComputationGraphNodesMaximum + - GGMLComputationGraphOverhead(GGMLComputationGraphNodesMaximum, false) + // See https://github.com/ggml-org/llama.cpp/blob/ec9e0301fef6476df83e94842c3b625501c95566/src/llama-context.cpp#L1241-L1243. + maxNodes := max(1024, uint64(8*len(gf.TensorInfos))) + + // Bootstrap, compute metadata. 
+ cm := GGMLTensorOverhead()*maxNodes + GGMLComputationGraphOverhead(maxNodes, false) e.Devices[0].Computation.Footprint = GGUFBytesScalar(cm) // Scheduler overhead, @@ -506,21 +669,19 @@ func (gf *GGUFFile) estimateLLaMACppRunInModel(o *_GGUFRunEstimateOptions, a *GG inpPos = GGMLTypeI32.RowSizeOf([]uint64{nBatch}) // I32 [n_batch] inpOutIds = GGMLTypeI32.RowSizeOf([]uint64{nOutputs}) // I32 [n_outputs], inpKQMask = GGMLTypeF32.RowSizeOf([]uint64{nKV, nBatch}) // F32 [n_kv, n_batch] - inpSMask = GGMLTypeF32.RowSizeOf([]uint64{1, nKV}) // F32 [1, n_kv] - inpSSeq = GGMLTypeI32.RowSizeOf([]uint64{nKV, nBatch}) // I32 [n_kv, n_batch] + inpSMask = GGMLTypeF32.RowSizeOf([]uint64{1, nSeq}) // F32 [1, n_seq] + inpSSeq = GGMLTypeI32.RowSizeOf([]uint64{nSeq, nBatch}) // I32 [n_seq, n_batch] ) - switch { - case a.Architecture == "mamba": - e.Devices[0].Computation.Input = GGUFBytesScalar(inpTokens + inpEmbd + inpSMask + inpSSeq + inpOutIds) - default: + if a.AttentionRecurrent { + e.Devices[0].Computation.Input = GGUFBytesScalar(inpTokens + inpEmbd + 2*inpSMask + inpSSeq + inpOutIds) + } else { e.Devices[0].Computation.Input = GGUFBytesScalar(inpTokens + inpEmbd + inpPos + inpKQMask + inpOutIds) } - if !zeroOffload { + { var v GGUFBytesScalar - switch { - case a.Architecture == "mamba": + if a.AttentionRecurrent { v = GGUFBytesScalar(inpEmbd + inpSMask + inpSSeq) - default: + } else { v = GGUFBytesScalar(inpEmbd + inpPos + inpKQMask) } if len(o.RPCServers) == 0 && len(o.TensorSplitFraction) > 1 { @@ -538,35 +699,61 @@ func (gf *GGUFFile) estimateLLaMACppRunInModel(o *_GGUFRunEstimateOptions, a *GG // the allocated memory can be reused for the next layer. // So, we only consider the usage of the largest layer, // which is the last layer by default. - switch { - case a.Architecture == "mamba": - convInc := GGMLTypeF32.RowSizeOf([]uint64{a.EmbeddingKeyGQA, nKV}) // F32 [n_embd_key_gqa, n_kv] reshape - for _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.(attn_norm|ssm_in|ssm_conv1d)\.weight`)) { - if !strings.HasSuffix(l.Name, ".ssm_conv1d.weight") { - rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens}) + if a.AttentionRecurrent && !a.AttentionHybrid { + if a.RWKVHeadSize > 0 { + attnInc := uint64(0) + for _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.(attn_norm|attn_norm_2)\.weight`)) { + rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nBatch}) + attnInc += rs + } + ffnInc := uint64(0) + for _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.time_mix_(lerp_x|receptance|decay_w2|key|value|gate|w2|output)\.weight`)) { // nolint: lll + switch { + case strings.HasSuffix(l.Name, ".time_mix_w2.weight"): + rs := GGMLTypeF32.RowSizeOf([]uint64{a.EmbeddingLength, 1, nTokens, l.Dimensions[l.NDimensions-1]}) + ffnInc += rs + case strings.HasSuffix(l.Name, ".time_mix_output.weight"): + rs := GGMLTypeF32.RowSizeOf([]uint64{a.EmbeddingLength, nBatch + uint64(a.RWKVHeadSize)*nSeq}) + ffnInc += rs + default: + rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nBatch}) + ffnInc += rs + } + } + cp := GGUFBytesScalar(attnInc + ffnInc) + for i := range e.Devices[1:] { + e.Devices[i+1].Computation.Compute = cp + } + } else { + r := uint64((a.SSMConvolutionKernel - 1) * (a.SSMInnerSize + 2*a.SSMGroupCount*a.SSMStateSize)) + convInc := GGMLTypeF32.RowSizeOf([]uint64{r, nSeq}) // F32 [n_embd_key_gqa, nSeq] reshape + for _, l := range 
tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.(attn_norm|ssm_in|ssm_conv1d)\.weight`)) { + if !strings.HasSuffix(l.Name, ".ssm_conv1d.weight") { + rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens}) + convInc += rs + continue + } + // https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L10379. + rs := GGMLTypeF32.RowSizeOf([]uint64{uint64(a.SSMInnerSize)*nTokens + uint64(a.SSMConvolutionKernel)*uint64(a.SSMInnerSize)*nSeq}) convInc += rs - continue } - // https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L10379. - rs := GGMLTypeF32.RowSizeOf([]uint64{uint64(a.SSMInnerSize)*nTokens + uint64(a.SSMConvolutionKernel)*uint64(a.SSMInnerSize)*nKV}) - convInc += rs - } - ssmInc := uint64(0) - for _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.ssm_(dt\.weight|a)`)) { - if !strings.HasSuffix(l.Name, ".ssm_a") { - rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens}) + ssmInc := uint64(0) + for _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.ssm_(dt\.weight|a)`)) { + if !strings.HasSuffix(l.Name, ".ssm_a") { + rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens}) + ssmInc += rs + continue + } + // https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L10413. + rs := GGMLTypeF32.RowSizeOf([]uint64{uint64(a.SSMInnerSize)*nTokens + uint64(a.SSMStateSize)*uint64(a.SSMInnerSize)*nSeq}) ssmInc += rs - continue } - // https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L10413. - rs := GGMLTypeF32.RowSizeOf([]uint64{uint64(a.SSMInnerSize)*nTokens + uint64(a.SSMStateSize)*uint64(a.SSMInnerSize)*nKV}) - ssmInc += rs - } - cp := GGUFBytesScalar(convInc + ssmInc) - for i := range e.Devices[1:] { - e.Devices[i+1].Computation.Compute = cp + cp := GGUFBytesScalar(convInc + ssmInc) + for i := range e.Devices[1:] { + e.Devices[i+1].Computation.Compute = cp + } } - default: + } else { loadAttnInc, offloadAttnInc := uint64(0), uint64(0) { rs := o.LMCCacheKeyType.RowSizeOf([]uint64{uint64(a.AttentionKeyLength), nKV, a.AttentionHeadCountKV}) @@ -577,7 +764,7 @@ func (gf *GGUFFile) estimateLLaMACppRunInModel(o *_GGUFRunEstimateOptions, a *GG if o.FlashAttention { // https://github.com/ggerganov/llama.cpp/blob/172c8256840ffd882ab9992ecedbb587d9b21f15/llama.cpp#L7387. offloadAttnInc = GGMLTypeF16.RowSizeOf([]uint64{nKV, nTokens}) - for _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.attn_(norm|q|qkv)\.weight`)) { + for _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.attn_(norm|q|qkv|q_b)\.weight`)) { if strings.HasSuffix(l.Name, ".attn_norm.weight") { rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens}) offloadAttnInc += rs @@ -644,7 +831,7 @@ func (gf *GGUFFile) estimateLLaMACppRunInModel(o *_GGUFRunEstimateOptions, a *GG } else { e.Devices[0].Computation.Compute = GGUFBytesScalar(loadAttnInc) } - if !zeroOffload { + { cp := GGUFBytesScalar(max(offloadAttnInc, ffnInc)) for i := range e.Devices[1:] { e.Devices[i+1].Computation.Compute = cp @@ -663,9 +850,13 @@ func (gf *GGUFFile) estimateLLaMACppRunInModel(o *_GGUFRunEstimateOptions, a *GG // Finally, get the usage of output layer. 
if a.AttentionCausal { var outInc uint64 - if a.Architecture == "mamba" { + if a.AttentionRecurrent { outInc += inpSMask + inpSSeq } + if l, ok := opLs.Get("output_norm.weight"); ok { + rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens}) + outInc += rs + } if l, ok := opLs.Get("output.weight"); ok { rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens}) outInc += rs @@ -692,7 +883,7 @@ func (gf *GGUFFile) estimateLLaMACppRunInModel(o *_GGUFRunEstimateOptions, a *GG bs := anyx.Number[float64](*o.LMCLogicalBatchSize) / float64(nBatch) for i, dm := range dmss { fl, upbw, dwbw := float64(max(dm.FLOPS, 1)), float64(max(dm.UpBandwidth, 1)), float64(max(dm.DownBandwidth, 1)) - cmpops := float64(ds[i].Parameter.Compute)*2 /* FMA */ *bs + float64(ds[i].Parameter.Input) + float64(ds[i].Parameter.Output) + cmpops := float64(ds[i].Parameter.Compute+ds[i].Parameter.ComputeOverridden)*2 /* FMA */ *bs + float64(ds[i].Parameter.Input) + float64(ds[i].Parameter.Output) // nolint: lll cmps := float64(ds[i].Weight.Sum()) cmplat := max(cmpops/fl, cmps/upbw) kvcops := float64(ds[i].Parameter.KVCache) * 2 /* FMA */ * bs @@ -715,83 +906,161 @@ func (gf *GGUFFile) estimateLLaMACppRunInModel(o *_GGUFRunEstimateOptions, a *GG } } +// estimateLLaMACppRunInProjector estimates the usages of the GGUF file for projector. func (gf *GGUFFile) estimateLLaMACppRunInProjector(o *_GGUFRunEstimateOptions, a *GGUFArchitecture, e *LLaMACppRunEstimate) { ls := gf.Layers() ioLs, tfLs, _ := ls.Cut([]string{ + "mm.*", + // Vision specific IO layers. "v.patch_embd.*", "v.class_embd", "v.position_embd.*", "v.pre_ln.*", - "model.*", "v.post_ln.*", - "mm.*", + "model.*", "resampler.*", + // Audio specific IO layers. + "a.position_embd.*", + "a.conv1d.*", + "a.post_ln.*", }) ipLs, opLs, _ := ioLs.Cut([]string{ + // Vision specific Input layers. "v.patch_embd.*", "v.class_embd", "v.position_embd.*", "v.pre_ln.*", "model.*", + // Audio specific Input layers. + "a.position_embd.*", + "a.conv1d.*", }) - if a.BlockCount == 0 { - a.BlockCount = uint64(len(tfLs)) + // Block count. + if a.ClipHasVisionEncoder && a.ClipVisionBlockCount == 0 { + if len(tfLs) == 1 { + if ntfLs, ok := tfLs[0].(*GGUFNamedTensorInfos); ok && slices.Contains([]string{"v"}, ntfLs.Name) { + a.ClipVisionBlockCount = uint64(len(ntfLs.GGUFLayerTensorInfos)) + } + } + if a.ClipVisionBlockCount == 0 { + a.ClipVisionBlockCount = uint64(len(tfLs)) + } + } + if a.ClipHasAudioEncoder && a.ClipAudioBlockCount == 0 { + if len(tfLs) == 1 { + if ntfLs, ok := tfLs[0].(*GGUFNamedTensorInfos); ok && slices.Contains([]string{"a"}, ntfLs.Name) { + a.ClipAudioBlockCount = uint64(len(ntfLs.GGUFLayerTensorInfos)) + } + } + if a.ClipAudioBlockCount == 0 { + a.ClipAudioBlockCount = uint64(len(tfLs)) + } } - e.FullOffloaded = *o.LMCOffloadLayers == a.BlockCount - e.OffloadLayers = *o.LMCOffloadLayers + // Offload layers. + if *o.LMCOffloadLayers == math.MaxUint64 { + e.FullOffloaded = true + e.OffloadLayers = a.ClipVisionBlockCount + a.ClipAudioBlockCount + o.LMCOffloadLayers = ptr.To(e.OffloadLayers) + } else { + e.FullOffloaded = false + e.OffloadLayers = 0 + } - // Init hyperparameters, - // see https://github.com/ggerganov/llama.cpp/blob/0827b2c1da299805288abbd556d869318f2b121e/examples/llava/clip.cpp#L599-L636. 
- var ( - imgHeightSize uint64 - imgWidthSize uint64 - imgPatchSize uint64 - nPatchesHeight uint64 - nPatchesWidth uint64 - nPatches uint64 - imgPatchesMaxSize uint64 - imgPatches uint64 - projectionDim uint64 // NB(thxCode): do not sure if there is the correct name. - ) + // Footprint. { + // Bootstrap. + e.Devices[0].Footprint = GGUFBytesScalar(5*1024*1024) /* model load */ + (gf.Size - gf.ModelSize) /* metadata */ + } + + idx := 0 // Default to the main host's RAM. + if e.FullOffloaded { + for i := 1; i < len(e.Devices); i++ { + if !e.Devices[i].Remote { + idx = i + break + } + } + } + + // Weight & Parameter. + { + // Compute. + e.Devices[idx].HandleLayers = *o.LMCOffloadLayers + e.Devices[idx].HandleLastLayer = int(e.Devices[idx].HandleLayers - 1) + e.Devices[idx].Weight.Compute = GGUFBytesScalar(tfLs.Bytes()) + e.Devices[idx].Parameter.Compute = GGUFParametersScalar(tfLs.Elements()) + + // IO. + e.Devices[idx].Weight.Input = GGUFBytesScalar(ipLs.Bytes()) + e.Devices[idx].Parameter.Input = GGUFParametersScalar(ipLs.Elements()) + e.Devices[idx].Weight.Output = GGUFBytesScalar(opLs.Bytes()) + e.Devices[idx].Parameter.Output = GGUFParametersScalar(opLs.Elements()) + } + + if a.ClipHasVisionEncoder { + // Init hyperparameters, + // see https://github.com/ggerganov/llama.cpp/blob/0827b2c1da299805288abbd556d869318f2b121e/examples/llava/clip.cpp#L599-L636. + var ( + heightMaxSize uint64 // y + widthMaxSize uint64 // x + // See https://github.com/ggml-org/llama.cpp/blob/6385b843a8dc8e15b8362196039720c58dd79fa2/tools/mtmd/clip.cpp#L3462. + nPatches uint64 + patchesMaxSize uint64 + // See https://github.com/ggml-org/llama.cpp/blob/6385b843a8dc8e15b8362196039720c58dd79fa2/tools/mtmd/clip.cpp#L4016. + projectionDim uint64 // NB(thxCode): do not sure if there is the correct name. + ) // See https://github.com/ggerganov/llama.cpp/blob/0827b2c1da299805288abbd556d869318f2b121e/examples/llava/llava.cpp#L397-L411, // https://github.com/ggerganov/llama.cpp/blob/0827b2c1da299805288abbd556d869318f2b121e/examples/llava/clip.cpp#L2323-L2345, // https://github.com/ggerganov/llama.cpp/blob/0827b2c1da299805288abbd556d869318f2b121e/examples/llava/clip.cpp#L2767-L2794. - imgHeightSize = uint64(a.ClipVisionImageSize) - imgWidthSize = imgHeightSize - imgPatchSize = uint64(a.ClipVisionPatchSize) - if a.ClipHasQwen2VLMerger { - imgHeightSize = uint64(ptr.Deref(o.LMCVisualMaxImageSize, 224)) - imgWidthSize = imgHeightSize + heightMaxSize = uint64(a.ClipVisionImageSize) + widthMaxSize = heightMaxSize + if a.ClipHasQwen2VLMerger || + a.ClipProjectorType == "qwen2vl_merger" || + a.ClipProjectorType == "qwen2.5vl_merger" || + a.ClipProjectorType == "qwen2.5o" || + a.ClipProjectorType == "pixtral" { + // See https://github.com/ggml-org/llama.cpp/blob/ec9e0301fef6476df83e94842c3b625501c95566/tools/mtmd/clip.cpp#L2217. 
+ heightMaxSize = uint64(ptr.Deref(o.LMCVisualMaxImageSize, 1024)) + widthMaxSize = heightMaxSize } - nPatchesHeight = imgHeightSize / imgPatchSize - nPatchesWidth = imgWidthSize / imgPatchSize + nPatchSize := uint64(a.ClipVisionPatchSize) + nPatchesHeight := heightMaxSize / nPatchSize + nPatchesWidth := widthMaxSize / nPatchSize nPatches = nPatchesHeight * nPatchesWidth - imgPatchesMaxSize = 1 - imgPatches = nPatches + patchesMaxSize = 1 switch { - case a.ClipHasLLaVAProjector: + case a.ClipHasLLaVAProjector || + a.ClipProjectorType == "mlp" || + a.ClipProjectorType == "mlp_norm" || + a.ClipProjectorType == "ldp" || + a.ClipProjectorType == "ldpv2": // LLaVA 1.6 uses up to 6 patches if a.ClipVisionMMPatchMergeType != "flat" { - imgPatchesMaxSize = 6 + patchesMaxSize = 6 } - case a.ClipHasMiniCPMVProjector: + case a.ClipHasMiniCPMVProjector || + a.ClipProjectorType == "resampler": // MiniCPM-V uses up to 10 patches - imgPatchesMaxSize = 10 + patchesMaxSize = 10 case a.ClipProjectorType == "adapter": // Granite vision uses up to 10 patches + base patch - imgPatchesMaxSize = 11 + patchesMaxSize = 11 + } + + if o.LMCMaxProjectedCache != nil { + patchesMaxSize += uint64(*o.LMCMaxProjectedCache) } + switch a.ClipProjectorType { case "ldp": - imgPatches /= 4 + nPatches /= 4 if ti, ok := gf.TensorInfos.Get("mm.model.mb_block.1.block.2.1.bias"); ok { projectionDim = ti.Dimensions[0] } case "ldpv2": - imgPatches /= 4 + nPatches /= 4 if ti, ok := gf.TensorInfos.Get("mm.model.peg.0.bias"); ok { projectionDim = ti.Dimensions[0] } @@ -805,142 +1074,208 @@ func (gf *GGUFFile) estimateLLaMACppRunInProjector(o *_GGUFRunEstimateOptions, a } case "resampler": if ti, ok := gf.TensorInfos.Get("resampler.query"); ok { - imgPatches = ti.Dimensions[1] + nPatches = ti.Dimensions[1] projectionDim = ti.Dimensions[0] } case "adapter": + nPatches /= 4 + nPatches += 2 if ti, ok := gf.TensorInfos.Get("adapter.linear.dense_4h_to_h.weight"); ok { projectionDim = ti.Dimensions[1] } - case "qwen2vl_merger": + case "qwen2vl_merger", "qwen2.5vl_merger", "qwen2.5o": nSizePatch := uint64(a.ClipVisionPatchSize * 2) - imgHeightPatchSize := imgHeightSize / nSizePatch - if imgHeightSize%nSizePatch > 0 { - imgHeightPatchSize++ + heightPatchSize := heightMaxSize / nSizePatch + if heightMaxSize%nSizePatch > 0 { + heightPatchSize++ } - imgWidthPatchSize := imgWidthSize / nSizePatch - if imgWidthSize%nSizePatch > 0 { - imgWidthPatchSize++ + widthPatchSize := widthMaxSize / nSizePatch + if widthMaxSize%nSizePatch > 0 { + widthPatchSize++ } - imgPatches = imgHeightPatchSize * imgWidthPatchSize + nPatches = heightPatchSize * widthPatchSize if ti, ok := gf.TensorInfos.Get("mm.2.bias"); ok { projectionDim = ti.Dimensions[0] } case "gemma3": + nPerSide := uint64(a.ClipVisionImageSize) / uint64(a.ClipVisionPatchSize) + nPerSide2DPool := nPerSide / uint64(a.ClipVisionProjectorScaleFactor) + nPatches = nPerSide2DPool * nPerSide2DPool if ti, ok := gf.TensorInfos.Get("mm.input_projection.weight"); ok { - imgPatches = 256 projectionDim = ti.Dimensions[0] } + case "idefics3", "llama4": + nPatches /= uint64(a.ClipVisionProjectorScaleFactor * a.ClipVisionProjectorScaleFactor) + if ti, ok := gf.TensorInfos.Get("mm.model.fc.weight"); ok { + projectionDim = ti.Dimensions[1] + } + case "pixtral": + heightPatchSize := heightMaxSize / uint64(a.ClipVisionPatchSize) + if a.ClipVisionSpatialMergeSize > 0 { + heightPatchSize /= uint64(a.ClipVisionSpatialMergeSize) + } + widthPatchSize := widthMaxSize / uint64(a.ClipVisionPatchSize) + if 
a.ClipVisionSpatialMergeSize > 0 { + widthPatchSize /= uint64(a.ClipVisionSpatialMergeSize) + } + nPatches = heightPatchSize*widthPatchSize + heightPatchSize - 1 /* [IMG_BREAK] per row */ + if ti, ok := gf.TensorInfos.Get("mm.2.bias"); ok { + projectionDim = ti.Dimensions[0] + } + case "internvl": + nPatches /= uint64(a.ClipVisionProjectorScaleFactor * a.ClipVisionProjectorScaleFactor) + if ti, ok := gf.TensorInfos.Get("mm.model.mlp.3.weight"); ok { + projectionDim = ti.Dimensions[1] + } } - } - - // Footprint. - { - // Bootstrap. - e.Devices[0].Footprint = GGUFBytesScalar(5*1024*1024) /* model load */ + (gf.Size - gf.ModelSize) /* metadata */ - // Image Embed, - // see https://github.com/ggerganov/llama.cpp/blob/0827b2c1da299805288abbd556d869318f2b121e/examples/llava/llava.cpp#L401-L407. - e.Devices[0].Footprint += GGUFBytesScalar(imgPatchesMaxSize * imgPatches * projectionDim * 4 /* float32 size */) - } + // Footprint + { + // Image Embed, + // see https://github.com/ggerganov/llama.cpp/blob/0827b2c1da299805288abbd556d869318f2b121e/examples/llava/llava.cpp#L401-L407. + e.Devices[0].Footprint += GGUFBytesScalar(patchesMaxSize * nPatches * projectionDim * 4 /* float32 size */) + } - idx := 0 // Default to the main host's RAM. - if *o.LMCOffloadLayers != 0 { - for i := 1; i < len(e.Devices); i++ { - if !e.Devices[i].Remote { - idx = i - break + // Computation. + { + // See https://github.com/ggml-org/llama.cpp/blob/ec9e0301fef6476df83e94842c3b625501c95566/tools/mtmd/clip.cpp#L374. + var maxNodes uint64 = 8192 + + // Bootstrap, compute metadata. + cm := GGMLTensorOverhead()*maxNodes + GGMLComputationGraphOverhead(maxNodes, false) + e.Devices[0].Computation.Footprint += GGUFBytesScalar(cm) + + // Scheduler overhead, + // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L16149. + e.Devices[0].Computation.Footprint += GGUFBytesScalar(4 * 1024 * 1024) + + // GGML context, + // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L5015-L5036. + gc := 2 /* buffer count */ * GGMLTensorOverhead() * (uint64(len(gf.TensorInfos)) + 1 + a.ClipVisionBlockCount*3) + e.Devices[0].Computation.Footprint += GGUFBytesScalar(gc) + + // Tensor usage. + var ( + hasClassEmbd bool + nPositions uint64 + nBatch uint64 + nEmbd uint64 + nHead uint64 + ) + { + _, hasClassEmbd = ipLs.Get("v.class_embd") + nPositions = nPatches + if hasClassEmbd { + nPositions += 1 + } + if a.ClipHasQwen2VLMerger || + a.ClipProjectorType == "qwen2vl_merger" || + a.ClipProjectorType == "qwen2.5vl_merger" || + a.ClipProjectorType == "qwen2.5o" { + nPositions *= 4 + } + nBatch = 1 + nEmbd = a.ClipVisionEmbeddingLength + nHead = a.ClipVisionAttentionHeadCount + } + // First, get the usage of input layer. 
+ { + var ( + inpRaw = GGMLTypeF32.RowSizeOf([]uint64{widthMaxSize, heightMaxSize, 3, nBatch}) // F32 [img_width, img_height, 3, n_batch] + inpRawCnt = GGMLTypeF32.RowSizeOf([]uint64{nPatches, nEmbd, nBatch}) // I32 [n_patches, n_embd, n_batch] + inpEmbd = GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions, nBatch}) // F32 [n_embd, n_positions, n_batch] + inpPosEmbd = GGMLTypeF32.RowSizeOf([]uint64{projectionDim, nPatches, nBatch}) // F32 [mmproj, n_patches, n_batch] + inpPos = GGMLTypeI32.RowSizeOf([]uint64{nPositions}) // I32 [n_positions] + inpPatches = GGMLTypeI32.RowSizeOf([]uint64{nPatches}) // I32 [n_patches] + ) + e.Devices[idx].Computation.Input += GGUFBytesScalar(inpRaw + inpRawCnt + inpPos + inpPatches) + if a.ClipHasMiniCPMVProjector || + a.ClipProjectorType == "resampler" { + e.Devices[idx].Computation.Input += GGUFBytesScalar(inpPosEmbd) + } + if hasClassEmbd { + e.Devices[idx].Computation.Input += GGUFBytesScalar(inpEmbd) + } + if a.ClipVisionWindowAttentionPattern > 0 { // Qwen2.5 VL + inpWindowIndex := GGMLTypeI32.RowSizeOf([]uint64{nPatches}) // I32 [n_patches] + inpWindowMask := GGMLTypeI32.RowSizeOf([]uint64{nPositions, nPositions}) // I32 [n_positions, n_positions] + e.Devices[idx].Computation.Input += GGUFBytesScalar(inpWindowIndex + inpWindowMask) + } + } + // Since the steps between transformer layers are serial, + // the allocated memory can be reused for the next layer. + // So, we only consider the usage of a certain layer. + { + compNorm := GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions}) * 2 + compVcur := GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions}) + compKcur := GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions}) + compKQcur := GGMLTypeF32.RowSizeOf([]uint64{nPositions, nPositions, nHead}) + e.Devices[idx].Computation.Compute += GGUFBytesScalar(compNorm + compVcur + compKcur + compKQcur) } } } - // Weight & Parameter. - { - // Compute. - e.Devices[idx].HandleLayers = *o.LMCOffloadLayers - e.Devices[idx].HandleLastLayer = int(e.Devices[idx].HandleLayers - 1) - e.Devices[idx].Weight.Compute = GGUFBytesScalar(tfLs.Bytes()) - e.Devices[idx].Parameter.Compute = GGUFParametersScalar(tfLs.Elements()) - - // IO. - e.Devices[idx].Weight.Input = GGUFBytesScalar(ipLs.Bytes()) - e.Devices[idx].Parameter.Input = GGUFParametersScalar(ipLs.Elements()) - e.Devices[idx].Weight.Output = GGUFBytesScalar(opLs.Bytes()) - e.Devices[idx].Parameter.Output = GGUFParametersScalar(opLs.Elements()) - } - - // Computation. - { - // Bootstrap, compute metadata, - // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L16135-L16136. - cm := GGMLTensorOverhead()*GGMLComputationGraphNodesMaximum + - GGMLComputationGraphOverhead(GGMLComputationGraphNodesMaximum, false) - e.Devices[0].Computation.Footprint = GGUFBytesScalar(cm) - - // Scheduler overhead, - // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L16149. - e.Devices[0].Computation.Footprint += GGUFBytesScalar(4 * 1024 * 1024) - - // GGML context, - // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L5015-L5036. - gc := 2 /* buffer count */ * GGMLTensorOverhead() * (uint64(len(gf.TensorInfos)) + 1 + a.BlockCount*3) - e.Devices[0].Computation.Footprint += GGUFBytesScalar(gc) - - // Tensor usage. 
- var ( - hasClassEmbd bool - nPositions uint64 - nPositionIDs uint64 - nBatch uint64 - nEmbd uint64 - nHead uint64 - ) + if a.ClipHasAudioEncoder { + // See https://github.com/ggml-org/llama.cpp/blob/6385b843a8dc8e15b8362196039720c58dd79fa2/tools/mtmd/mtmd-audio.cpp#L311. + var projectionDim uint64 // NB(thxCode): do not sure if there is the correct name. { - _, hasClassEmbd = ipLs.Get("v.class_embd") - nPositions = nPatches - if hasClassEmbd { - nPositions += 1 - } - nPositionIDs = nPositions - if a.ClipHasQwen2VLMerger { - nPositionIDs *= 4 + if ti, ok := gf.TensorInfos.Get("a.position_embd.weight"); ok { + projectionDim = ti.Dimensions[1] } - nBatch = 1 - nEmbd = a.EmbeddingLength - nHead = a.AttentionHeadCount } - // First, get the usage of input layer. - var ( - inpRaw = GGMLTypeF32.RowSizeOf([]uint64{imgWidthSize, imgHeightSize, 3, nBatch}) // F32 [img_width, img_height, 3, n_batch] - inpRawCnt = GGMLTypeF32.RowSizeOf([]uint64{nPatches, nEmbd, nBatch}) // I32 [n_patches, n_embd, n_batch] - inpEmbd = GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions, nBatch}) // F32 [n_embd, n_positions, n_batch] - inpPosEmbd = GGMLTypeF32.RowSizeOf([]uint64{projectionDim, nPatchesHeight * nPatchesWidth, nBatch}) // F32 [mmproj, pos_h * pos_w, n_batch] - inpPos = GGMLTypeI32.RowSizeOf([]uint64{nPositionIDs}) // I32 [n_positions] - inpPatches = GGMLTypeI32.RowSizeOf([]uint64{nPatches}) // I32 [n_patches] - ) + + // Computation. { - e.Devices[idx].Computation.Input = GGUFBytesScalar(inpRaw + inpRawCnt + inpPos + inpPatches) - if a.ClipHasMiniCPMVProjector { - e.Devices[idx].Computation.Input += GGUFBytesScalar(inpPosEmbd) + // See https://github.com/ggml-org/llama.cpp/blob/ec9e0301fef6476df83e94842c3b625501c95566/tools/mtmd/clip.cpp#L374. + var maxNodes uint64 = 8192 + + // Bootstrap, compute metadata. + cm := GGMLTensorOverhead()*maxNodes + GGMLComputationGraphOverhead(maxNodes, false) + e.Devices[0].Computation.Footprint += GGUFBytesScalar(cm) + + // Scheduler overhead, + // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L16149. + e.Devices[0].Computation.Footprint += GGUFBytesScalar(4 * 1024 * 1024) + + // GGML context, + // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L5015-L5036. + gc := 2 /* buffer count */ * GGMLTensorOverhead() * (uint64(len(gf.TensorInfos)) + 1 + a.ClipAudioBlockCount*3) + e.Devices[0].Computation.Footprint += GGUFBytesScalar(gc) + + // Tensor usage. + var ( + nPositions uint64 + nBatch uint64 + nEmbd uint64 + nHead uint64 + ) + { + nPositions = projectionDim + nBatch = 1 + nEmbd = a.ClipAudioEmbeddingLength + nHead = a.ClipAudioAttentionHeadCount } - if hasClassEmbd { + // First, get the usage of input layer. + { + inpEmbd := GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions, nBatch}) // F32 [n_embed, n_positions, n_batch] e.Devices[idx].Computation.Input += GGUFBytesScalar(inpEmbd) } - } - // Since the steps between transformer layers are serial, - // the allocated memory can be reused for the next layer. - // So, we only consider the usage of a certain layer. 
- { - compNorm := GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions}) * 2 - compVcur := GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions}) - compKcur := GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions}) - compKQcur := GGMLTypeF32.RowSizeOf([]uint64{nPositions, nPositions, nHead}) - e.Devices[idx].Computation.Compute = GGUFBytesScalar(compNorm + compVcur + compKcur + compKQcur) + // Since the steps between transformer layers are serial, + // the allocated memory can be reused for the next layer. + // So, we only consider the usage of a certain layer. + { + compNorm := GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions}) + compVcur := GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions}) + compKcur := GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions}) + compKQcur := GGMLTypeF32.RowSizeOf([]uint64{nPositions, nPositions, nHead}) + e.Devices[idx].Computation.Compute += GGUFBytesScalar(compNorm + compVcur + compKcur + compKQcur) + } } } } -func (gf *GGUFFile) estimateLLaMaCppRunInAdapter(o *_GGUFRunEstimateOptions, a *GGUFArchitecture, e *LLaMACppRunEstimate) { +// estimateLLaMACppRunInAdapter estimates the usages of the GGUF file for adapter. +func (gf *GGUFFile) estimateLLaMACppRunInAdapter(o *_GGUFRunEstimateOptions, a *GGUFArchitecture, e *LLaMACppRunEstimate) { ls := gf.Layers() ioLs, tfLs, _ := ls.Cut([]string{ "position_*", @@ -1048,7 +1383,7 @@ func (gf *GGUFFile) estimateLLaMaCppRunInAdapter(o *_GGUFRunEstimateOptions, a * if _, ok := opLs.Get("output.weight"); ok { wg = GGUFBytesScalar(opLs.Bytes()) ps = GGUFParametersScalar(opLs.Elements()) - } else if a.AttentionCausal { + } else { wg = GGUFBytesScalar(opLs.Bytes()) + e.Devices[0].Weight.Input /* duplicate the input layer */ ps = GGUFParametersScalar(opLs.Elements() + ipLs.Elements()) } @@ -1062,6 +1397,36 @@ func (gf *GGUFFile) estimateLLaMaCppRunInAdapter(o *_GGUFRunEstimateOptions, a * } } +// estimateLLaMACppRunInIMatrix estimates the usages of the GGUF file for imatrix. +func (gf *GGUFFile) estimateLLaMACppRunInIMatrix(_ *_GGUFRunEstimateOptions, a *GGUFArchitecture, e *LLaMACppRunEstimate) { + ls := gf.Layers() + + if a.BlockCount == 0 { + a.BlockCount = uint64(len(ls)) + } + + // Distributable. + e.Distributable = false + + // Footprint. + { + // Bootstrap. + e.Devices[0].Footprint = GGUFBytesScalar(5*1024*1024) /* model load */ + (gf.Size - gf.ModelSize) /* metadata */ + } + + // Weight & Parameter. + { + var ( + wg GGUFBytesScalar + ps GGUFParametersScalar + ) + wg = GGUFBytesScalar(ls.Bytes()) + ps = GGUFParametersScalar(ls.Elements()) + e.Devices[0].Weight.Compute = wg + e.Devices[0].Parameter.Compute = ps + } +} + // Types for LLaMACpp estimated summary. type ( // LLaMACppRunEstimateSummary represents the summary of the usage for loading the GGUF file in llama.cpp. 
@@ -1177,6 +1542,7 @@ func (e LLaMACppRunEstimate) SummarizeItem(mmap bool, nonUMARamFootprint, nonUMA emi.RAM.UMA -= wg if !mmap { emi.RAM.UMA += e.Devices[0].Weight.Output + emi.RAM.UMA += e.Devices[0].Weight.ComputeOverridden } } @@ -1277,7 +1643,7 @@ func (e LLaMACppRunEstimate) Summarize(mmap bool, nonUMARamFootprint, nonUMAVram } func (u LLaMACppWeightMemoryUsage) Sum() GGUFBytesScalar { - return u.Input + u.Compute + u.Output + return u.Input + u.Compute + u.ComputeOverridden + u.Output } func (u LLaMACppKVCacheMemoryUsage) Sum() GGUFBytesScalar { @@ -1287,3 +1653,9 @@ func (u LLaMACppKVCacheMemoryUsage) Sum() GGUFBytesScalar { func (u LLaMACppComputationMemoryUsage) Sum() GGUFBytesScalar { return u.Footprint + u.Input + max(u.Compute, u.Output) } + +// ClipAligning returns the aligned value of x to the nearest multiple of n, +// see https://github.com/ggml-org/llama.cpp/blob/cdf94a18023c92f41808ec874ba577d914674717/tools/mtmd/clip-impl.h#L114-L115. +func ClipAligning(x, n uint64) uint64 { + return ((x + n - 1) / n) * n +} diff --git a/vendor/github.com/gpustack/gguf-parser-go/file_estimate__stablediffusioncpp.go b/vendor/github.com/gpustack/gguf-parser-go/file_estimate__stablediffusioncpp.go index dc1dd0f2..34cb5228 100644 --- a/vendor/github.com/gpustack/gguf-parser-go/file_estimate__stablediffusioncpp.go +++ b/vendor/github.com/gpustack/gguf-parser-go/file_estimate__stablediffusioncpp.go @@ -70,6 +70,7 @@ type ( } ) +// EstimateStableDiffusionCppRun estimates the usages of the GGUF file in stable-diffusion.cpp. func (gf *GGUFFile) EstimateStableDiffusionCppRun(opts ...GGUFRunEstimateOption) (e StableDiffusionCppRunEstimate) { // Options var o _GGUFRunEstimateOptions @@ -233,7 +234,7 @@ func (gf *GGUFFile) EstimateStableDiffusionCppRun(opts ...GGUFRunEstimateOption) } // Autoencoder. - if aeLs != nil { + if len(aeLs) != 0 { e.Autoencoder.Devices[aeDevIdx].Weight = GGUFBytesScalar(aeLs.Bytes()) e.Autoencoder.Devices[aeDevIdx].Parameter = GGUFParametersScalar(aeLs.Elements()) } @@ -245,10 +246,11 @@ func (gf *GGUFFile) EstimateStableDiffusionCppRun(opts ...GGUFRunEstimateOption) // Computation. { - // Bootstrap, compute metadata, - // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L16135-L16136. - cm := GGMLTensorOverhead()*GGMLComputationGraphNodesMaximum + - GGMLComputationGraphOverhead(GGMLComputationGraphNodesMaximum, false) + // See https://github.com/leejet/stable-diffusion.cpp/blob/10c6501bd05a697e014f1bee3a84e5664290c489/ggml_extend.hpp#L1058C9-L1058C23. + var maxNodes uint64 = 32768 + + // Bootstrap, compute metadata. + cm := GGMLTensorOverhead()*maxNodes + GGMLComputationGraphOverhead(maxNodes, false) e.Devices[0].Computation = GGUFBytesScalar(cm) // Work context, @@ -350,7 +352,7 @@ func (gf *GGUFFile) EstimateStableDiffusionCppRun(opts ...GGUFRunEstimateOption) } // Decode usage. - if aeLs != nil && !*o.SDCFreeComputeMemoryImmediately { + if len(aeLs) != 0 && !*o.SDCFreeComputeMemoryImmediately { // Bootstrap. 
e.Autoencoder.Devices[aeDevIdx].Footprint += GGUFBytesScalar(100 * 1024 * 1024) /*100 MiB.*/ diff --git a/vendor/github.com/gpustack/gguf-parser-go/file_estimate_option.go b/vendor/github.com/gpustack/gguf-parser-go/file_estimate_option.go index 9dee4bf0..3591ad6f 100644 --- a/vendor/github.com/gpustack/gguf-parser-go/file_estimate_option.go +++ b/vendor/github.com/gpustack/gguf-parser-go/file_estimate_option.go @@ -1,7 +1,9 @@ package gguf_parser import ( + "regexp" "slices" + "strconv" "github.com/gpustack/gguf-parser-go/util/ptr" ) @@ -14,22 +16,29 @@ type ( MainGPUIndex int RPCServers []string TensorSplitFraction []float64 + OverriddenTensors []*GGUFRunOverriddenTensor DeviceMetrics []GGUFRunDeviceMetric // LLaMACpp (LMC) specific - LMCContextSize *int32 - LMCInMaxContextSize bool - LMCLogicalBatchSize *int32 - LMCPhysicalBatchSize *int32 - LMCVisualMaxImageSize *uint32 - LMCCacheKeyType *GGMLType - LMCCacheValueType *GGMLType - LMCOffloadKVCache *bool - LMCOffloadLayers *uint64 - LMCSplitMode LLaMACppSplitMode - LMCProjector *LLaMACppRunEstimate - LMCDrafter *LLaMACppRunEstimate - LMCAdapters []LLaMACppRunEstimate + LMCContextSize *int32 + LMCRoPEFrequencyBase *float32 + LMCRoPEFrequencyScale *float32 + LMCRoPEScalingType *string + LMCRoPEScalingOriginalContextSize *int32 + LMCInMaxContextSize bool + LMCLogicalBatchSize *int32 + LMCPhysicalBatchSize *int32 + LMCVisualMaxImageSize *uint32 + LMCMaxProjectedCache *uint32 + LMCCacheKeyType *GGMLType + LMCCacheValueType *GGMLType + LMCOffloadKVCache *bool + LMCOffloadLayers *uint64 + LMCSplitMode LLaMACppSplitMode + LMCFullSizeSWACache bool + LMCProjector *LLaMACppRunEstimate + LMCDrafter *LLaMACppRunEstimate + LMCAdapters []LLaMACppRunEstimate // StableDiffusionCpp (SDC) specific SDCOffloadLayers *uint64 @@ -44,6 +53,24 @@ type ( SDCControlNet *StableDiffusionCppRunEstimate } + // GGUFRunOverriddenTensor holds the overridden tensor information for the estimate. + // + // When BufferType is CPU, + // it indicates that the tensor should be loaded into the CPU memory, + // even if it belongs to a GPU offload layer. + GGUFRunOverriddenTensor struct { + // PatternRegex is the regex pattern to match the tensor name. + PatternRegex *regexp.Regexp + // BufferType is the buffer type to override, + // it can be "CPU", "CUDA0", "Metal" and others. + BufferType string + + // _BufferType record parsed buffer type, used internally. + _BufferType GGUFRunOverriddenTensorBufferType + // _Index record parsed device index, used internally. + _Index string + } + // GGUFRunDeviceMetric holds the device metric for the estimate. // // When the device represents a CPU, @@ -74,6 +101,53 @@ type ( GGUFRunEstimateOption func(*_GGUFRunEstimateOptions) ) +// GGUFRunOverriddenTensorBufferType is the type of the overridden tensor buffer. +type GGUFRunOverriddenTensorBufferType uint32 + +const ( + _ GGUFRunOverriddenTensorBufferType = iota + GGUFRunOverriddenTensorBufferTypeCPU + GGUFRunOverriddenTensorBufferTypeGPU + GGUFRunOverriddenTensorBufferTypeRPC + GGUFRunOverriddenTensorBufferTypeUnknown +) + +var ( + _GGUFRunOverriddenTensorBufferTypeCPURegex = regexp.MustCompile(`^(CPU|AMX)`) + _GGUFRunOverriddenTensorBufferTypeUMAGPURegex = regexp.MustCompile(`^(Metal|OpenCL)`) + _GGUFRunOverriddenTensorBufferTypeNonUMAGPURegex = regexp.MustCompile(`^(CUDA|CANN|ROCm|MUSA|SYCL|Vulkan|Kompute)(\d+)?`) + _GGUFRunOverriddenTensorBufferTypeRPCRegex = regexp.MustCompile(`^RPC\[(.*)\]`) +) + +// ParseBufferType returns the device index of the overridden tensor. 
+// +// The device index is used to determine which device the tensor belongs to, +// it is according to the buffer type description. +func (odt *GGUFRunOverriddenTensor) ParseBufferType() (GGUFRunOverriddenTensorBufferType, string) { + if odt == nil { + return GGUFRunOverriddenTensorBufferTypeUnknown, "" + } + + if odt._BufferType == 0 { + odt._BufferType = GGUFRunOverriddenTensorBufferTypeUnknown + if ms := _GGUFRunOverriddenTensorBufferTypeCPURegex.FindStringSubmatch(odt.BufferType); len(ms) > 1 { + odt._BufferType, odt._Index = GGUFRunOverriddenTensorBufferTypeCPU, "0" + } + if ms := _GGUFRunOverriddenTensorBufferTypeUMAGPURegex.FindStringSubmatch(odt.BufferType); len(ms) > 1 { + odt._BufferType, odt._Index = GGUFRunOverriddenTensorBufferTypeGPU, "1" + } + if ms := _GGUFRunOverriddenTensorBufferTypeRPCRegex.FindStringSubmatch(odt.BufferType); len(ms) > 1 { + odt._BufferType, odt._Index = GGUFRunOverriddenTensorBufferTypeRPC, ms[1] + } + if ms := _GGUFRunOverriddenTensorBufferTypeNonUMAGPURegex.FindStringSubmatch(odt.BufferType); len(ms) > 2 { + if idx, err := strconv.ParseInt(ms[2], 10, 64); err == nil && idx >= 0 { + odt._BufferType, odt._Index = GGUFRunOverriddenTensorBufferTypeGPU, ms[2] + } + } + } + return odt._BufferType, odt._Index +} + // WithParallelSize sets the (decoding sequences) parallel size for the estimate. func WithParallelSize(size int32) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { @@ -137,6 +211,24 @@ func WithTensorSplitFraction(fractions []float64) GGUFRunEstimateOption { } } +// WithOverriddenTensors sets the overridden tensors for the estimate. +func WithOverriddenTensors(tensors []GGUFRunOverriddenTensor) GGUFRunEstimateOption { + return func(o *_GGUFRunEstimateOptions) { + if len(tensors) == 0 { + return + } + for _, t := range tensors { + if t.PatternRegex == nil || t.BufferType == "" { + return + } + } + o.OverriddenTensors = make([]*GGUFRunOverriddenTensor, len(tensors)) + for i := range tensors { + o.OverriddenTensors[i] = &tensors[i] + } + } +} + // WithDeviceMetrics sets the device metrics for the estimate. func WithDeviceMetrics(metrics []GGUFRunDeviceMetric) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { @@ -157,6 +249,29 @@ func WithLLaMACppContextSize(size int32) GGUFRunEstimateOption { } } +// WithLLaMACppRoPE sets the RoPE parameters for the estimate. +func WithLLaMACppRoPE( + frequencyBase float64, + frequencyScale float64, + scalingType string, + scalingOriginalContextSize int32, +) GGUFRunEstimateOption { + return func(o *_GGUFRunEstimateOptions) { + if frequencyBase > 0 { + o.LMCRoPEFrequencyBase = ptr.Float32(float32(frequencyBase)) + } + if frequencyScale > 0 { + o.LMCRoPEFrequencyScale = ptr.Float32(float32(frequencyScale)) + } + if slices.Contains([]string{"none", "linear", "yarn"}, scalingType) { + o.LMCRoPEScalingType = &scalingType + } + if scalingOriginalContextSize > 0 { + o.LMCRoPEScalingOriginalContextSize = ptr.To(scalingOriginalContextSize) + } + } +} + // WithinLLaMACppMaxContextSize limits the context size to the maximum, // if the context size is over the maximum. func WithinLLaMACppMaxContextSize() GGUFRunEstimateOption { @@ -247,6 +362,13 @@ func WithLLaMACppSplitMode(mode LLaMACppSplitMode) GGUFRunEstimateOption { } } +// WithLLaMACppFullSizeSWACache enables full size sliding window attention cache. 
+func WithLLaMACppFullSizeSWACache() GGUFRunEstimateOption { + return func(o *_GGUFRunEstimateOptions) { + o.LMCFullSizeSWACache = true + } +} + // WithLLaMACppVisualMaxImageSize sets the visual maximum image size input for the estimate. func WithLLaMACppVisualMaxImageSize(size uint32) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { @@ -257,6 +379,16 @@ func WithLLaMACppVisualMaxImageSize(size uint32) GGUFRunEstimateOption { } } +// WithLLaMACppMaxProjectedCache sets the maximum projected embedding cache for the estimate. +func WithLLaMACppMaxProjectedCache(cacheSize uint32) GGUFRunEstimateOption { + return func(o *_GGUFRunEstimateOptions) { + if cacheSize == 0 { + return + } + o.LMCMaxProjectedCache = ptr.To(cacheSize) + } +} + // WithLLaMACppDrafter sets the drafter estimate usage. func WithLLaMACppDrafter(dft *LLaMACppRunEstimate) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { diff --git a/vendor/github.com/gpustack/gguf-parser-go/file_metadata.go b/vendor/github.com/gpustack/gguf-parser-go/file_metadata.go index 13e39e05..bc569e1e 100644 --- a/vendor/github.com/gpustack/gguf-parser-go/file_metadata.go +++ b/vendor/github.com/gpustack/gguf-parser-go/file_metadata.go @@ -2,6 +2,7 @@ package gguf_parser import ( "regexp" + "slices" "sort" "strings" @@ -53,6 +54,10 @@ type GGUFMetadata struct { License string `json:"license,omitempty"` // FileType describes the type of the majority of the tensors in the GGUF file. FileType GGUFFileType `json:"fileType"` + // FileTypeDescriptor describes the type of the GGUF file according to the FileType and trait layer. + // + // This supplies the FileType with more detail. + FileTypeDescriptor string `json:"fileTypeDetail"` /* Appendix */ @@ -70,52 +75,85 @@ type GGUFMetadata struct { } // GGUFFileType is a type of GGUF file, -// see https://github.com/ggerganov/llama.cpp/blob/278d0e18469aacf505be18ce790a63c7cc31be26/ggml/include/ggml.h#L404-L433. +// see https://github.com/ggml-org/llama.cpp/blob/fd1234cb468935ea087d6929b2487926c3afff4b/ggml/include/ggml.h#L419-L445, +// and https://github.com/huggingface/huggingface.js/blob/d67a464473ca07fee9811a129e5fac8cc7487098/packages/tasks/src/gguf.ts#L4-L52. type GGUFFileType uint32 // GGUFFileType constants. // // GGUFFileTypeMostlyQ4_2, GGUFFileTypeMostlyQ4_3 are deprecated. +// GGUFFileTypeMostlyQ4_0_4_4, GGUFFileTypeMostlyQ4_0_4_8, GGUFFileTypeMostlyQ4_0_8_8 are deprecated. // -// GGUFFileTypeMostlyQ4_1_F16 is a special case where the majority of the tensors are Q4_1, +// GGUFFileTypeMostlyQ4_1_SOME_F16 is a special case where the majority of the tensors are Q4_1, // but 'token_embd.weight' and 'output.weight' tensors are F16. 
const ( - GGUFFileTypeAllF32 GGUFFileType = iota // F32 - GGUFFileTypeMostlyF16 // F16 - GGUFFileTypeMostlyQ4_0 // Q4_0 - GGUFFileTypeMostlyQ4_1 // Q4_1 - GGUFFileTypeMostlyQ4_1_F16 // Q4_1_F16 - GGUFFileTypeMostlyQ4_2 // Q4_2 - GGUFFileTypeMostlyQ4_3 // Q4_3 - GGUFFileTypeMostlyQ8_0 // Q8_0 - GGUFFileTypeMostlyQ5_0 // Q5_0 - GGUFFileTypeMostlyQ5_1 // Q5_1 - GGUFFileTypeMostlyQ2_K // Q2_K - GGUFFileTypeMostlyQ3_K // Q3_K/Q3_K_S - GGUFFileTypeMostlyQ4_K // Q4_K/Q3_K_M - GGUFFileTypeMostlyQ5_K // Q5_K/Q3_K_L - GGUFFileTypeMostlyQ6_K // Q6_K/Q4_K_S - GGUFFileTypeMostlyIQ2_XXS // IQ2_XXS/Q4_K_M - GGUFFileTypeMostlyIQ2_XS // IQ2_XS/Q5_K_S - GGUFFileTypeMostlyIQ3_XXS // IQ3_XXS/Q5_K_M - GGUFFileTypeMostlyIQ1_S // IQ1_S/Q6_K - GGUFFileTypeMostlyIQ4_NL // IQ4_NL - GGUFFileTypeMostlyIQ3_S // IQ3_S - GGUFFileTypeMostlyIQ2_S // IQ2_S - GGUFFileTypeMostlyIQ4_XS // IQ4_XS - GGUFFileTypeMostlyIQ1_M // IQ1_M - GGUFFileTypeMostlyBF16 // BF16 - GGUFFileTypeMostlyQ4_0_4_4 // Q4_0_4x4 - GGUFFileTypeMostlyQ4_0_4_8 // Q4_0_4x8 - GGUFFileTypeMostlyQ4_0_8_8 // Q4_0_8x8 - GGUFFileTypeMostlyTQ1_0 // TQ1_0 - GGUFFileTypeMostlyTQ2_0 // TQ2_0 - GGUFFileTypeMostlyIQ4_NL_4_4 // IQ4_NL_4x4 - GGUFFileTypeMostlyIQ4_NL_4_8 // IQ4_NL_4x8 - GGUFFileTypeMostlyIQ4_NL_8_8 // IQ4_NL_8x8 - _GGUFFileTypeCount // Unknown + GGUFFileTypeMostlyF32 GGUFFileType = iota // MOSTLY_F32 + GGUFFileTypeMostlyF16 // MOSTLY_F16 + GGUFFileTypeMostlyQ4_0 // MOSTLY_Q4_0 + GGUFFileTypeMostlyQ4_1 // MOSTLY_Q4_1 + GGUFFileTypeMostlyQ4_1_SOME_F16 // MOSTLY_Q4_1_SOME_F16 + GGUFFileTypeMostlyQ4_2 // MOSTLY_Q4_2 + GGUFFileTypeMostlyQ4_3 // MOSTLY_Q4_3 + GGUFFileTypeMostlyQ8_0 // MOSTLY_Q8_0 + GGUFFileTypeMostlyQ5_0 // MOSTLY_Q5_0 + GGUFFileTypeMostlyQ5_1 // MOSTLY_Q5_1 + GGUFFileTypeMostlyQ2_K // MOSTLY_Q2_K + GGUFFileTypeMostlyQ3_K_S // MOSTLY_Q3_K_S + GGUFFileTypeMostlyQ3_K_M // MOSTLY_Q3_K_M + GGUFFileTypeMostlyQ3_K_L // MOSTLY_Q3_K_L + GGUFFileTypeMostlyQ4_K_S // MOSTLY_Q4_K_S + GGUFFileTypeMostlyQ4_K_M // MOSTLY_Q4_K_M + GGUFFileTypeMostlyQ5_K_S // MOSTLY_Q5_K_S + GGUFFileTypeMostlyQ5_K_M // MOSTLY_Q5_K_M + GGUFFileTypeMostlyQ6_K // MOSTLY_Q6_K + GGUFFileTypeMostlyIQ2_XXS // MOSTLY_IQ2_XXS + GGUFFileTypeMostlyIQ2_XS // MOSTLY_IQ2_XS + GGUFFileTypeMostlyQ2_K_S // MOSTLY_Q2_K_S + GGUFFileTypeMostlyIQ3_XS // MOSTLY_IQ3_XS + GGUFFileTypeMostlyIQ3_XXS // MOSTLY_IQ3_XXS + GGUFFileTypeMostlyIQ1_S // MOSTLY_IQ1_S + GGUFFileTypeMostlyIQ4_NL // MOSTLY_IQ4_NL + GGUFFileTypeMostlyIQ3_S // MOSTLY_IQ3_S + GGUFFileTypeMostlyIQ3_M // MOSTLY_IQ3_M + GGUFFileTypeMostlyIQ2_S // MOSTLY_IQ2_S + GGUFFileTypeMostlyIQ2_M // MOSTLY_IQ2_M + GGUFFileTypeMostlyIQ4_XS // MOSTLY_IQ4_XS + GGUFFileTypeMostlyIQ1_M // MOSTLY_IQ1_M + GGUFFileTypeMostlyBF16 // MOSTLY_BF16 + GGUFFileTypeMostlyQ4_0_4_4 // MOSTLY_Q4_0_4_4 + GGUFFileTypeMostlyQ4_0_4_8 // MOSTLY_Q4_0_4_8 + GGUFFileTypeMostlyQ4_0_8_8 // MOSTLY_Q4_0_8_8 + GGUFFileTypeMostlyTQ1_0 // MOSTLY_TQ1_0 + GGUFFileTypeMostlyTQ2_0 // MOSTLY_TQ2_0 + GGUFFileTypeMostlyMXFP4 // MOSTLY_MXFP4 + _GGUFFileTypeCount // Unknown ) +// _GGUFPotentialDiffusionArchitectures holds a list representing the potential diffusion architectures. +// +// Since we will unify all diffusion architectures to "diffusion" during processing, +// we can use this list to match the value in explicit `general.architecture`. 
+var _GGUFPotentialDiffusionArchitectures = []string{ + "flux", + "sd", + "sd2.5", + "sd3", + "stable-diffusion", +} + +// _GGUFPotentialDiffusionArchitectureTensorsRegexes holds a list of regexes to match the potential diffusion architecture tensors. +// +// This is used to detect if the GGUF file is a diffusion model, +// when the `general.architecture` is not set to a known diffusion architecture. +var _GGUFPotentialDiffusionArchitectureTensorsRegexes = []*regexp.Regexp{ + regexp.MustCompile(`^model\.diffusion_model\..*`), + regexp.MustCompile(`^double_blocks\..*`), + regexp.MustCompile(`^joint_blocks\..*`), + regexp.MustCompile(`^decoder\..*`), + regexp.MustCompile(`^encoder\..*`), + regexp.MustCompile(`^text_model\..*`), +} + // Metadata returns the metadata of the GGUF file. func (gf *GGUFFile) Metadata() (gm GGUFMetadata) { const ( @@ -128,13 +166,10 @@ func (gf *GGUFFile) Metadata() (gm GGUFMetadata) { urlKey = "general.url" descriptionKey = "general.description" licenseKey = "general.license" - fileTypeKey = "general.file_type" controlVectorModelHintKey = "controlvector.model_hint" ) - gm.FileType = _GGUFFileTypeCount - m, _ := gf.Header.MetadataKV.Index([]string{ typeKey, architectureKey, @@ -145,7 +180,6 @@ func (gf *GGUFFile) Metadata() (gm GGUFMetadata) { urlKey, descriptionKey, licenseKey, - fileTypeKey, controlVectorModelHintKey, }) @@ -158,17 +192,20 @@ func (gf *GGUFFile) Metadata() (gm GGUFMetadata) { } if v, ok := m[controlVectorModelHintKey]; ok { gm.Architecture = v.ValueString() - } else if v, ok = m[architectureKey]; ok { + } else if v, ok = m[architectureKey]; ok && !slices.Contains(_GGUFPotentialDiffusionArchitectures, v.ValueString()) { gm.Architecture = v.ValueString() if gm.Architecture == "clip" { gm.Type = "projector" } + } else if gm.Type == "imatrix" { + gm.Architecture = "imatrix" // Default to imatrix. } else { - if gf.TensorInfos.Match(regexp.MustCompile(`^model\.diffusion_model\..*`)) || - gf.TensorInfos.Match(regexp.MustCompile(`^double_blocks\..*`)) { - gm.Architecture = "diffusion" - } else { - gm.Architecture = "llama" + gm.Architecture = "llama" // Default to llama. + for _, re := range _GGUFPotentialDiffusionArchitectureTensorsRegexes { + if gf.TensorInfos.Match(re) { + gm.Architecture = "diffusion" + break + } } } if v, ok := m[quantizationKey]; ok { @@ -194,13 +231,7 @@ func (gf *GGUFFile) Metadata() (gm GGUFMetadata) { if v, ok := m[licenseKey]; ok { gm.License = v.ValueString() } - if v, ok := m[fileTypeKey]; ok { - gm.FileType = GGUFFileType(ValueNumeric[uint32](v)) - } - - if gm.FileType >= _GGUFFileTypeCount { - gm.FileType = gf.guessFileType(gm.Architecture) - } + gm.FileType, gm.FileTypeDescriptor = gf.extractFileType(gm.Architecture) gm.LittleEndian = gf.Header.Version < GGUFVersionV3 || gf.Header.Magic == GGUFMagicGGUFLe gm.FileSize = gf.Size @@ -216,7 +247,7 @@ func (gf *GGUFFile) Metadata() (gm GGUFMetadata) { // https://github.com/ggerganov/ggml/blob/a10a8b880c059b3b29356eb9a9f8df72f03cdb6a/src/ggml.c#L2730-L2763. 
func (t GGUFFileType) GGMLType() GGMLType { switch t { - case GGUFFileTypeAllF32: + case GGUFFileTypeMostlyF32: return GGMLTypeF32 case GGUFFileTypeMostlyF16: return GGMLTypeF16 @@ -224,6 +255,8 @@ func (t GGUFFileType) GGMLType() GGMLType { return GGMLTypeQ4_0 case GGUFFileTypeMostlyQ4_1: return GGMLTypeQ4_1 + case GGUFFileTypeMostlyQ4_1_SOME_F16: + return GGMLTypeQ4_1 case GGUFFileTypeMostlyQ4_2: return GGMLTypeQ4_2 case GGUFFileTypeMostlyQ4_3: @@ -236,11 +269,19 @@ func (t GGUFFileType) GGMLType() GGMLType { return GGMLTypeQ5_1 case GGUFFileTypeMostlyQ2_K: return GGMLTypeQ2_K - case GGUFFileTypeMostlyQ3_K: + case GGUFFileTypeMostlyQ3_K_S: return GGMLTypeQ3_K - case GGUFFileTypeMostlyQ4_K: + case GGUFFileTypeMostlyQ3_K_M: + return GGMLTypeQ4_K + case GGUFFileTypeMostlyQ3_K_L: + return GGMLTypeQ5_K + case GGUFFileTypeMostlyQ4_K_S: + return GGMLTypeQ6_K + case GGUFFileTypeMostlyQ4_K_M: return GGMLTypeQ4_K - case GGUFFileTypeMostlyQ5_K: + case GGUFFileTypeMostlyQ5_K_S: + return GGMLTypeQ5_K + case GGUFFileTypeMostlyQ5_K_M: return GGMLTypeQ5_K case GGUFFileTypeMostlyQ6_K: return GGMLTypeQ6_K @@ -248,6 +289,10 @@ func (t GGUFFileType) GGMLType() GGMLType { return GGMLTypeIQ2_XXS case GGUFFileTypeMostlyIQ2_XS: return GGMLTypeIQ2_XS + case GGUFFileTypeMostlyQ2_K_S: + return GGMLTypeQ2_K + case GGUFFileTypeMostlyIQ3_XS: + return GGMLTypeIQ3_S case GGUFFileTypeMostlyIQ3_XXS: return GGMLTypeIQ3_XXS case GGUFFileTypeMostlyIQ1_S: @@ -256,7 +301,11 @@ func (t GGUFFileType) GGMLType() GGMLType { return GGMLTypeIQ4_NL case GGUFFileTypeMostlyIQ3_S: return GGMLTypeIQ3_S + case GGUFFileTypeMostlyIQ3_M: + return GGMLTypeIQ3_S case GGUFFileTypeMostlyIQ2_S: + return GGMLTypeIQ2_XS + case GGUFFileTypeMostlyIQ2_M: return GGMLTypeIQ2_S case GGUFFileTypeMostlyIQ4_XS: return GGMLTypeIQ4_XS @@ -274,39 +323,122 @@ func (t GGUFFileType) GGMLType() GGMLType { return GGMLTypeTQ1_0 case GGUFFileTypeMostlyTQ2_0: return GGMLTypeTQ2_0 - case GGUFFileTypeMostlyIQ4_NL_4_4: - return GGMLTypeIQ4_NL_4_4 - case GGUFFileTypeMostlyIQ4_NL_4_8: - return GGMLTypeIQ4_NL_4_8 - case GGUFFileTypeMostlyIQ4_NL_8_8: - return GGMLTypeIQ4_NL_8_8 + case GGUFFileTypeMostlyMXFP4: + return GGMLTypeMXFP4 default: } return _GGMLTypeCount } -// guessFileType guesses the GGUF file type by -// statistically analyzing the tensor types, -// which is inspired by -// https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML#provided-files. -func (gf *GGUFFile) guessFileType(arch string) GGUFFileType { - if len(gf.TensorInfos) == 0 { - return _GGUFFileTypeCount +// extractFileType extracts the GGUF file type from the metadata, +// it tries to return the descriptor of the file type. +func (gf *GGUFFile) extractFileType(arch string) (fileType GGUFFileType, fileTypeDescriptor string) { + fileType, fileTypeDescriptor = _GGUFFileTypeCount, "Unknown" + + const fileTypeKey = "general.file_type" + m, _ := gf.Header.MetadataKV.Index([]string{ + fileTypeKey, + }) + if v, ok := m[fileTypeKey]; ok { + fileType = GGUFFileType(ValueNumeric[uint32](v)) + } + + if fileType == _GGUFFileTypeCount { + // Guess. 
+ if len(gf.TensorInfos) != 0 { + cm := make(map[GGMLType]int) + for i := range gf.TensorInfos { + switch { + case arch != "diffusion" && + !strings.HasPrefix(gf.TensorInfos[i].Name, "token_embd") && + !strings.HasPrefix(gf.TensorInfos[i].Name, "blk.") && + !strings.Contains(gf.TensorInfos[i].Name, "_norm") && + !strings.HasSuffix(gf.TensorInfos[i].Name, ".weight"): + continue + case arch == "diffusion" && + !strings.HasSuffix(gf.TensorInfos[i].Name, ".weight"): + continue + } + cm[gf.TensorInfos[i].Type]++ + } + fileType = GetFileType(cm) + } } + if fileType == _GGUFFileTypeCount { + return fileType, fileTypeDescriptor + } + + fileTypeDescriptor = strings.TrimPrefix(fileType.String(), "MOSTLY_") - // Count. - cm := make(map[GGMLType]int) - for i := range gf.TensorInfos { - switch { - case arch != "diffusion" && !strings.HasPrefix(gf.TensorInfos[i].Name, "blk."): - continue - case arch == "diffusion" && !strings.HasSuffix(gf.TensorInfos[i].Name, ".weight"): - continue + const tokenEmbedWeightTensorName = "token_embd.weight" + + switch fileType { + case GGUFFileTypeMostlyQ4_0: + tis, _ := gf.TensorInfos.Index([]string{tokenEmbedWeightTensorName}) + if v, ok := tis[tokenEmbedWeightTensorName]; ok { + if v.Type == GGMLTypeQ8_0 || v.Type == GGMLTypeQ5_0 || v.Type == GGMLTypeQ5_1 { + fileTypeDescriptor = "Q4_0_L" + } + } + case GGUFFileTypeMostlyQ4_1: + tis, _ := gf.TensorInfos.Index([]string{tokenEmbedWeightTensorName}) + if v, ok := tis[tokenEmbedWeightTensorName]; ok { + if v.Type == GGMLTypeQ8_0 || v.Type == GGMLTypeQ5_0 || v.Type == GGMLTypeQ5_1 { + fileTypeDescriptor = "Q4_1_L" + } + } + case GGUFFileTypeMostlyQ5_0: + tis, _ := gf.TensorInfos.Index([]string{tokenEmbedWeightTensorName}) + if v, ok := tis[tokenEmbedWeightTensorName]; ok { + if v.Type == GGMLTypeQ8_0 { + fileTypeDescriptor = "Q5_0_L" + } + } + case GGUFFileTypeMostlyQ5_1: + tis, _ := gf.TensorInfos.Index([]string{tokenEmbedWeightTensorName}) + if v, ok := tis[tokenEmbedWeightTensorName]; ok { + if v.Type == GGMLTypeQ8_0 { + fileTypeDescriptor = "Q5_1_L" + } + } + case GGUFFileTypeMostlyQ2_K: + tis, _ := gf.TensorInfos.Index([]string{tokenEmbedWeightTensorName}) + if v, ok := tis[tokenEmbedWeightTensorName]; ok { + if v.Type == GGMLTypeQ8_0 || v.Type == GGMLTypeQ4_K { + fileTypeDescriptor = "Q2_K_L" + } + } + case GGUFFileTypeMostlyQ3_K_M: + tis, _ := gf.TensorInfos.Index([]string{tokenEmbedWeightTensorName}) + if v, ok := tis[tokenEmbedWeightTensorName]; ok { + if v.Type == GGMLTypeQ8_0 { + fileTypeDescriptor = "Q3_K_L" + } + } + case GGUFFileTypeMostlyQ4_K_M: + tis, _ := gf.TensorInfos.Index([]string{tokenEmbedWeightTensorName}) + if v, ok := tis[tokenEmbedWeightTensorName]; ok { + if v.Type == GGMLTypeQ8_0 { + fileTypeDescriptor = "Q4_K_L" + } + } + case GGUFFileTypeMostlyQ5_K_M: + tis, _ := gf.TensorInfos.Index([]string{tokenEmbedWeightTensorName}) + if v, ok := tis[tokenEmbedWeightTensorName]; ok { + if v.Type == GGMLTypeQ8_0 { + fileTypeDescriptor = "Q5_K_L" + } + } + case GGUFFileTypeMostlyQ6_K: + tis, _ := gf.TensorInfos.Index([]string{tokenEmbedWeightTensorName}) + if v, ok := tis[tokenEmbedWeightTensorName]; ok { + if v.Type == GGMLTypeQ8_0 { + fileTypeDescriptor = "Q6_K_L" + } } - cm[gf.TensorInfos[i].Type]++ } - return GetFileType(cm) + return fileType, fileTypeDescriptor } // GetFileType returns the GGUFFileType represented the mostly GGMLType of the given tensors counter. @@ -326,7 +458,7 @@ func GetFileType(cm map[GGMLType]int) GGUFFileType { // Guess. 
if ts[0] == GGMLTypeF32 { if len(ts) == 1 { - return GGUFFileTypeAllF32 + return GGUFFileTypeMostlyF32 } ts[0] = ts[1] } @@ -348,42 +480,54 @@ func GetFileType(cm map[GGMLType]int) GGUFFileType { case GGMLTypeQ8_0: return GGUFFileTypeMostlyQ8_0 case GGMLTypeQ2_K: + if ts[len(ts)-1] == GGMLTypeQ5_K { + return GGUFFileTypeMostlyQ2_K_S + } return GGUFFileTypeMostlyQ2_K case GGMLTypeQ3_K: - switch ts[1] { - case GGMLTypeQ4_K: // Legacy, Q3_K_M. - return GGUFFileTypeMostlyQ4_K - case GGMLTypeQ5_K: // Legacy, Q3_K_L. - return GGUFFileTypeMostlyQ5_K - default: // Legacy. Q3_K_S - return GGUFFileTypeMostlyQ3_K + if cm[GGMLTypeQ8_0] > 0 || + (cm[GGMLTypeQ5_K] > 1 && cm[GGMLTypeQ4_K] == 0) { + return GGUFFileTypeMostlyQ3_K_L + } + if cm[GGMLTypeQ4_K] > 1 { + return GGUFFileTypeMostlyQ3_K_M } + return GGUFFileTypeMostlyQ3_K_S case GGMLTypeQ4_K: - if len(ts) > 2 && ts[2] == GGMLTypeQ6_K { // Legacy, Q4_K_M. - return GGUFFileTypeMostlyIQ2_XXS + if cm[GGMLTypeQ6_K] > 1 { + return GGUFFileTypeMostlyQ4_K_M } - return GGUFFileTypeMostlyQ6_K // Legacy. Q4_K_S + if cm[GGMLTypeQ3_K] > 1 { + return GGUFFileTypeMostlyQ3_K_M + } + return GGUFFileTypeMostlyQ4_K_S case GGMLTypeQ5_K: - if len(ts) > 2 && ts[2] == GGMLTypeQ6_K { // Legacy, Q5_K_M. - return GGUFFileTypeMostlyIQ3_XXS + if cm[GGMLTypeQ6_K] > 1 { + return GGUFFileTypeMostlyQ5_K_M } - return GGUFFileTypeMostlyIQ2_XS // Legacy. Q5_K_S + return GGUFFileTypeMostlyQ5_K_S case GGMLTypeQ6_K: - return GGUFFileTypeMostlyIQ1_S // Legacy. Q6_K + return GGUFFileTypeMostlyQ6_K case GGMLTypeIQ2_XXS: return GGUFFileTypeMostlyIQ2_XXS case GGMLTypeIQ2_XS: + if cm[GGMLTypeIQ4_XS] > 1 { + return GGUFFileTypeMostlyIQ2_S + } return GGUFFileTypeMostlyIQ2_XS + case GGMLTypeIQ2_S: + return GGUFFileTypeMostlyIQ2_M case GGMLTypeIQ3_XXS: return GGUFFileTypeMostlyIQ3_XXS + case GGMLTypeIQ3_S: + if cm[GGMLTypeIQ3_XXS] > 1 { + return GGUFFileTypeMostlyIQ3_XS + } + return GGUFFileTypeMostlyIQ3_S case GGMLTypeIQ1_S: return GGUFFileTypeMostlyIQ1_S case GGMLTypeIQ4_NL: return GGUFFileTypeMostlyIQ4_NL - case GGMLTypeIQ3_S: - return GGUFFileTypeMostlyIQ3_S - case GGMLTypeIQ2_S: - return GGUFFileTypeMostlyIQ2_S case GGMLTypeIQ4_XS: return GGUFFileTypeMostlyIQ4_XS case GGMLTypeIQ1_M: @@ -400,12 +544,8 @@ func GetFileType(cm map[GGMLType]int) GGUFFileType { return GGUFFileTypeMostlyTQ1_0 case GGMLTypeTQ2_0: return GGUFFileTypeMostlyTQ2_0 - case GGMLTypeIQ4_NL_4_4: - return GGUFFileTypeMostlyIQ4_NL_4_4 - case GGMLTypeIQ4_NL_4_8: - return GGUFFileTypeMostlyIQ4_NL_4_8 - case GGMLTypeIQ4_NL_8_8: - return GGUFFileTypeMostlyIQ4_NL_8_8 + case GGMLTypeMXFP4: + return GGUFFileTypeMostlyMXFP4 default: } return _GGUFFileTypeCount diff --git a/vendor/github.com/gpustack/gguf-parser-go/ggml.go b/vendor/github.com/gpustack/gguf-parser-go/ggml.go index 07146935..7e17d376 100644 --- a/vendor/github.com/gpustack/gguf-parser-go/ggml.go +++ b/vendor/github.com/gpustack/gguf-parser-go/ggml.go @@ -9,11 +9,11 @@ import ( // Types for GGMLType. type ( // GGMLType is a type of GGML tensor, - // see https://github.com/ggerganov/llama.cpp/blob/b34e02348064c2f0cef1f89b44d9bee4eb15b9e7/ggml/include/ggml.h#L363-L401. + // see https://github.com/ggml-org/llama.cpp/blob/fd1234cb468935ea087d6929b2487926c3afff4b/ggml/include/ggml.h#L368-L410. GGMLType uint32 // GGMLTypeTrait holds the trait of a GGMLType, - // see https://github.com/ggerganov/llama.cpp/blob/b34e02348064c2f0cef1f89b44d9bee4eb15b9e7/ggml/src/ggml.c#L663-L1082. 
+ // see https://github.com/ggml-org/llama.cpp/blob/fd1234cb468935ea087d6929b2487926c3afff4b/ggml/src/ggml.c#L586-L876. GGMLTypeTrait struct { BlockSize uint64 // Original is int, in order to reduce conversion, here we use uint64. TypeSize uint64 // Original is uint32, in order to reduce conversion, here we use uint64. @@ -24,6 +24,8 @@ type ( // GGMLType constants. // // GGMLTypeQ4_2, GGMLTypeQ4_3 are deprecated. +// GGMLTypeQ4_0_4_4, GGMLTypeQ4_0_4_8, GGMLTypeQ4_0_8_8 are deprecated. +// GGMLTypeIQ4_NL_4_4, GGMLTypeIQ4_NL_4_8, GGMLTypeIQ4_NL_8_8 are deprecated. const ( GGMLTypeF32 GGMLType = iota GGMLTypeF16 @@ -64,6 +66,7 @@ const ( GGMLTypeIQ4_NL_4_4 GGMLTypeIQ4_NL_4_8 GGMLTypeIQ4_NL_8_8 + GGMLTypeMXFP4 _GGMLTypeCount // Unknown ) @@ -108,6 +111,7 @@ var _GGMLTypeTraits = map[GGMLType]GGMLTypeTrait{ GGMLTypeIQ4_NL_4_4: {BlockSize: 32, TypeSize: 18, Quantized: true}, GGMLTypeIQ4_NL_4_8: {BlockSize: 32, TypeSize: 18, Quantized: true}, GGMLTypeIQ4_NL_8_8: {BlockSize: 32, TypeSize: 18, Quantized: true}, + GGMLTypeMXFP4: {BlockSize: 32, TypeSize: 17, Quantized: true}, } // Trait returns the GGMLTypeTrait of the GGMLType. @@ -186,26 +190,28 @@ const ( // GGMLComputationGraphSize is the size of GGML computation graph in bytes. GGMLComputationGraphSize = 80 - // GGMLComputationGraphNodesMaximum is the maximum nodes of the computation graph, - // see https://github.com/ggerganov/llama.cpp/blob/7672adeec7a79ea271058c63106c142ba84f951a/llama.cpp#L103. - GGMLComputationGraphNodesMaximum = 8192 - - // GGMLComputationGraphNodesDefault is the default nodes of the computation graph, - // see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/include/ggml/ggml.h#L237. - GGMLComputationGraphNodesDefault = 2048 + // GGMLComputationBitsetSize is the size of GGML computation bitset in bytes, + // see https://github.com/ggml-org/llama.cpp/blob/master/ggml/src/ggml-impl.h#L165. + GGMLComputationBitsetSize = 4 ) // GGMLComputationGraphOverhead is the overhead of GGML graph in bytes, -// see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/src/ggml.c#L18905-L18917. +// see https://github.com/ggml-org/ggml/blob/5592ffda9c417c3c12232c828247c23d17004c88/src/ggml.c#L5941-L5956. func GGMLComputationGraphOverhead(nodes uint64, grads bool) uint64 { - const pointerSize = 8 + const ps = 8 // c++ pointer size + + hs := GGMLHashSize(nodes * 2) - var g uint64 = GGMLComputationGraphSize - g += pointerSize * nodes * 2 + var g uint64 = GGMLComputationGraphSize // graph + g += GGMLPadding(nodes*ps, ps) // nodes + g += GGMLPadding(nodes*ps, ps) // leafs + g += GGMLPadding(nodes*ps, ps) // parents + g += GGMLPadding(hs*ps, ps) // hash keys if grads { - g += pointerSize * nodes + g += GGMLPadding(hs*ps, ps) // grads + g += GGMLPadding(hs*ps, ps) // grad_accs } - g += pointerSize * GGMLHashSize(nodes) + g += GGMLPadding(GGMLBitsetSize(hs)*GGMLComputationBitsetSize, GGMLComputationBitsetSize) // bitset return GGMLObjectSize + GGMLMemoryPadding(g) } @@ -231,3 +237,9 @@ func GGMLHashSize(base uint64) uint64 { } return primes[i] } + +// GGMLBitsetSize returns the size of the bitset for the given number of bits, +// see https://github.com/ggml-org/llama.cpp/blob/ec9e0301fef6476df83e94842c3b625501c95566/ggml/src/ggml-impl.h#L166-L171. 
+func GGMLBitsetSize(n uint64) uint64 { + return (n + (GGMLComputationBitsetSize*8 - 1)) >> 5 +} diff --git a/vendor/github.com/gpustack/gguf-parser-go/ollama_registry_authenticate.go b/vendor/github.com/gpustack/gguf-parser-go/ollama_registry_authenticate.go index 45c4cb81..e2672b0b 100644 --- a/vendor/github.com/gpustack/gguf-parser-go/ollama_registry_authenticate.go +++ b/vendor/github.com/gpustack/gguf-parser-go/ollama_registry_authenticate.go @@ -36,7 +36,7 @@ const ( // since llama3.1, the user agent is required to be set, // otherwise the request will be rejected by 412. func OllamaUserAgent() string { - return fmt.Sprintf("ollama/0.3.3 (%s %s) Go/%s", runtime.GOARCH, runtime.GOOS, runtime.Version()) + return fmt.Sprintf("ollama/9.9.9 (%s %s) Go/%s", runtime.GOARCH, runtime.GOOS, runtime.Version()) } // OllamaRegistryAuthorizeRetry returns true if the request should be retried with authorization. diff --git a/vendor/github.com/gpustack/gguf-parser-go/util/httpx/resolver.go b/vendor/github.com/gpustack/gguf-parser-go/util/httpx/resolver.go index 42b10c3a..b1deb782 100644 --- a/vendor/github.com/gpustack/gguf-parser-go/util/httpx/resolver.go +++ b/vendor/github.com/gpustack/gguf-parser-go/util/httpx/resolver.go @@ -3,45 +3,30 @@ package httpx import ( "context" "net" - "time" - - "github.com/rs/dnscache" ) -// DefaultResolver is the default DNS resolver used by the package, -// which caches DNS lookups in memory. -var DefaultResolver = &dnscache.Resolver{ - // NB(thxCode): usually, a high latency DNS is about 3s, - // so we set the timeout to 5s here. - Timeout: 5 * time.Second, - Resolver: net.DefaultResolver, -} - -func init() { - go func() { - t := time.NewTimer(5 * time.Minute) - defer t.Stop() - for range t.C { - DefaultResolver.RefreshWithOptions(dnscache.ResolverRefreshOptions{ - ClearUnused: true, - PersistOnFailure: false, - }) - } - }() -} - func DNSCacheDialContext(dialer *net.Dialer) func(context.Context, string, string) (net.Conn, error) { + cs := map[string][]net.IP{} + return func(ctx context.Context, nw, addr string) (conn net.Conn, err error) { h, p, err := net.SplitHostPort(addr) if err != nil { return nil, err } - ips, err := DefaultResolver.LookupHost(ctx, h) - if err != nil { - return nil, err + ips, ok := cs[h] + if !ok { + ips, err = net.DefaultResolver.LookupIP(ctx, "ip4", h) + if len(ips) == 0 { + ips, err = net.DefaultResolver.LookupIP(ctx, "ip", h) + } + if err != nil { + return nil, err + } + cs[h] = ips } + // Try to connect to each IP address in order. 
for _, ip := range ips { - conn, err = dialer.DialContext(ctx, nw, net.JoinHostPort(ip, p)) + conn, err = dialer.DialContext(ctx, nw, net.JoinHostPort(ip.String(), p)) if err == nil { break } diff --git a/vendor/github.com/gpustack/gguf-parser-go/util/osx/file_mmap_windows.go b/vendor/github.com/gpustack/gguf-parser-go/util/osx/file_mmap_windows.go index b9879fc0..f7a09caa 100644 --- a/vendor/github.com/gpustack/gguf-parser-go/util/osx/file_mmap_windows.go +++ b/vendor/github.com/gpustack/gguf-parser-go/util/osx/file_mmap_windows.go @@ -22,7 +22,7 @@ func mmap(f *os.File, size int) ([]byte, error) { return nil, os.NewSyscallError("CloseHandle", err) } - return (*[maxMapSize]byte)(unsafe.Pointer(uintptr(addr)))[:size], nil + return (*[maxMapSize]byte)(unsafe.Pointer(addr))[:size], nil } func munmap(b []byte) error { diff --git a/vendor/github.com/gpustack/gguf-parser-go/zz_generated.ggmltype.stringer.go b/vendor/github.com/gpustack/gguf-parser-go/zz_generated.ggmltype.stringer.go index 3eaad12f..94541571 100644 --- a/vendor/github.com/gpustack/gguf-parser-go/zz_generated.ggmltype.stringer.go +++ b/vendor/github.com/gpustack/gguf-parser-go/zz_generated.ggmltype.stringer.go @@ -47,12 +47,13 @@ func _() { _ = x[GGMLTypeIQ4_NL_4_4-36] _ = x[GGMLTypeIQ4_NL_4_8-37] _ = x[GGMLTypeIQ4_NL_8_8-38] - _ = x[_GGMLTypeCount-39] + _ = x[GGMLTypeMXFP4-39] + _ = x[_GGMLTypeCount-40] } -const _GGMLType_name = "F32F16Q4_0Q4_1Q4_2Q4_3Q5_0Q5_1Q8_0Q8_1Q2_KQ3_KQ4_KQ5_KQ6_KQ8_KIQ2_XXSIQ2_XSIQ3_XXSIQ1_SIQ4_NLIQ3_SIQ2_SIQ4_XSI8I16I32I64F64IQ1_MBF16Q4_0_4_4Q4_0_4_8Q4_0_8_8TQ1_0TQ2_0IQ4_NL_4_4IQ4_NL_4_8IQ4_NL_8_8Unknown" +const _GGMLType_name = "F32F16Q4_0Q4_1Q4_2Q4_3Q5_0Q5_1Q8_0Q8_1Q2_KQ3_KQ4_KQ5_KQ6_KQ8_KIQ2_XXSIQ2_XSIQ3_XXSIQ1_SIQ4_NLIQ3_SIQ2_SIQ4_XSI8I16I32I64F64IQ1_MBF16Q4_0_4_4Q4_0_4_8Q4_0_8_8TQ1_0TQ2_0IQ4_NL_4_4IQ4_NL_4_8IQ4_NL_8_8MXFP4Unknown" -var _GGMLType_index = [...]uint8{0, 3, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62, 69, 75, 82, 87, 93, 98, 103, 109, 111, 114, 117, 120, 123, 128, 132, 140, 148, 156, 161, 166, 176, 186, 196, 203} +var _GGMLType_index = [...]uint8{0, 3, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62, 69, 75, 82, 87, 93, 98, 103, 109, 111, 114, 117, 120, 123, 128, 132, 140, 148, 156, 161, 166, 176, 186, 196, 201, 208} func (i GGMLType) String() string { if i >= GGMLType(len(_GGMLType_index)-1) { diff --git a/vendor/github.com/gpustack/gguf-parser-go/zz_generated.gguffiletype.stringer.go b/vendor/github.com/gpustack/gguf-parser-go/zz_generated.gguffiletype.stringer.go index a6abaa22..ba7f6385 100644 --- a/vendor/github.com/gpustack/gguf-parser-go/zz_generated.gguffiletype.stringer.go +++ b/vendor/github.com/gpustack/gguf-parser-go/zz_generated.gguffiletype.stringer.go @@ -8,45 +8,51 @@ func _() { // An "invalid array index" compiler error signifies that the constant values have changed. // Re-run the stringer command to generate them again. 
var x [1]struct{} - _ = x[GGUFFileTypeAllF32-0] + _ = x[GGUFFileTypeMostlyF32-0] _ = x[GGUFFileTypeMostlyF16-1] _ = x[GGUFFileTypeMostlyQ4_0-2] _ = x[GGUFFileTypeMostlyQ4_1-3] - _ = x[GGUFFileTypeMostlyQ4_1_F16-4] + _ = x[GGUFFileTypeMostlyQ4_1_SOME_F16-4] _ = x[GGUFFileTypeMostlyQ4_2-5] _ = x[GGUFFileTypeMostlyQ4_3-6] _ = x[GGUFFileTypeMostlyQ8_0-7] _ = x[GGUFFileTypeMostlyQ5_0-8] _ = x[GGUFFileTypeMostlyQ5_1-9] _ = x[GGUFFileTypeMostlyQ2_K-10] - _ = x[GGUFFileTypeMostlyQ3_K-11] - _ = x[GGUFFileTypeMostlyQ4_K-12] - _ = x[GGUFFileTypeMostlyQ5_K-13] - _ = x[GGUFFileTypeMostlyQ6_K-14] - _ = x[GGUFFileTypeMostlyIQ2_XXS-15] - _ = x[GGUFFileTypeMostlyIQ2_XS-16] - _ = x[GGUFFileTypeMostlyIQ3_XXS-17] - _ = x[GGUFFileTypeMostlyIQ1_S-18] - _ = x[GGUFFileTypeMostlyIQ4_NL-19] - _ = x[GGUFFileTypeMostlyIQ3_S-20] - _ = x[GGUFFileTypeMostlyIQ2_S-21] - _ = x[GGUFFileTypeMostlyIQ4_XS-22] - _ = x[GGUFFileTypeMostlyIQ1_M-23] - _ = x[GGUFFileTypeMostlyBF16-24] - _ = x[GGUFFileTypeMostlyQ4_0_4_4-25] - _ = x[GGUFFileTypeMostlyQ4_0_4_8-26] - _ = x[GGUFFileTypeMostlyQ4_0_8_8-27] - _ = x[GGUFFileTypeMostlyTQ1_0-28] - _ = x[GGUFFileTypeMostlyTQ2_0-29] - _ = x[GGUFFileTypeMostlyIQ4_NL_4_4-30] - _ = x[GGUFFileTypeMostlyIQ4_NL_4_8-31] - _ = x[GGUFFileTypeMostlyIQ4_NL_8_8-32] - _ = x[_GGUFFileTypeCount-33] + _ = x[GGUFFileTypeMostlyQ3_K_S-11] + _ = x[GGUFFileTypeMostlyQ3_K_M-12] + _ = x[GGUFFileTypeMostlyQ3_K_L-13] + _ = x[GGUFFileTypeMostlyQ4_K_S-14] + _ = x[GGUFFileTypeMostlyQ4_K_M-15] + _ = x[GGUFFileTypeMostlyQ5_K_S-16] + _ = x[GGUFFileTypeMostlyQ5_K_M-17] + _ = x[GGUFFileTypeMostlyQ6_K-18] + _ = x[GGUFFileTypeMostlyIQ2_XXS-19] + _ = x[GGUFFileTypeMostlyIQ2_XS-20] + _ = x[GGUFFileTypeMostlyQ2_K_S-21] + _ = x[GGUFFileTypeMostlyIQ3_XS-22] + _ = x[GGUFFileTypeMostlyIQ3_XXS-23] + _ = x[GGUFFileTypeMostlyIQ1_S-24] + _ = x[GGUFFileTypeMostlyIQ4_NL-25] + _ = x[GGUFFileTypeMostlyIQ3_S-26] + _ = x[GGUFFileTypeMostlyIQ3_M-27] + _ = x[GGUFFileTypeMostlyIQ2_S-28] + _ = x[GGUFFileTypeMostlyIQ2_M-29] + _ = x[GGUFFileTypeMostlyIQ4_XS-30] + _ = x[GGUFFileTypeMostlyIQ1_M-31] + _ = x[GGUFFileTypeMostlyBF16-32] + _ = x[GGUFFileTypeMostlyQ4_0_4_4-33] + _ = x[GGUFFileTypeMostlyQ4_0_4_8-34] + _ = x[GGUFFileTypeMostlyQ4_0_8_8-35] + _ = x[GGUFFileTypeMostlyTQ1_0-36] + _ = x[GGUFFileTypeMostlyTQ2_0-37] + _ = x[GGUFFileTypeMostlyMXFP4-38] + _ = x[_GGUFFileTypeCount-39] } -const _GGUFFileType_name = "F32F16Q4_0Q4_1Q4_1_F16Q4_2Q4_3Q8_0Q5_0Q5_1Q2_KQ3_K/Q3_K_SQ4_K/Q3_K_MQ5_K/Q3_K_LQ6_K/Q4_K_SIQ2_XXS/Q4_K_MIQ2_XS/Q5_K_SIQ3_XXS/Q5_K_MIQ1_S/Q6_KIQ4_NLIQ3_SIQ2_SIQ4_XSIQ1_MBF16Q4_0_4x4Q4_0_4x8Q4_0_8x8TQ1_0TQ2_0IQ4_NL_4x4IQ4_NL_4x8IQ4_NL_8x8Unknown" +const _GGUFFileType_name = "MOSTLY_F32MOSTLY_F16MOSTLY_Q4_0MOSTLY_Q4_1MOSTLY_Q4_1_SOME_F16MOSTLY_Q4_2MOSTLY_Q4_3MOSTLY_Q8_0MOSTLY_Q5_0MOSTLY_Q5_1MOSTLY_Q2_KMOSTLY_Q3_K_SMOSTLY_Q3_K_MMOSTLY_Q3_K_LMOSTLY_Q4_K_SMOSTLY_Q4_K_MMOSTLY_Q5_K_SMOSTLY_Q5_K_MMOSTLY_Q6_KMOSTLY_IQ2_XXSMOSTLY_IQ2_XSMOSTLY_Q2_K_SMOSTLY_IQ3_XSMOSTLY_IQ3_XXSMOSTLY_IQ1_SMOSTLY_IQ4_NLMOSTLY_IQ3_SMOSTLY_IQ3_MMOSTLY_IQ2_SMOSTLY_IQ2_MMOSTLY_IQ4_XSMOSTLY_IQ1_MMOSTLY_BF16MOSTLY_Q4_0_4_4MOSTLY_Q4_0_4_8MOSTLY_Q4_0_8_8MOSTLY_TQ1_0MOSTLY_TQ2_0MOSTLY_MXFP4Unknown" -var _GGUFFileType_index = [...]uint8{0, 3, 6, 10, 14, 22, 26, 30, 34, 38, 42, 46, 57, 68, 79, 90, 104, 117, 131, 141, 147, 152, 157, 163, 168, 172, 180, 188, 196, 201, 206, 216, 226, 236, 243} +var _GGUFFileType_index = [...]uint16{0, 10, 20, 31, 42, 62, 73, 84, 95, 106, 117, 128, 141, 154, 167, 180, 193, 206, 219, 230, 244, 257, 270, 283, 297, 309, 322, 334, 346, 358, 370, 383, 395, 406, 
421, 436, 451, 463, 475, 487, 494} func (i GGUFFileType) String() string { if i >= GGUFFileType(len(_GGUFFileType_index)-1) { diff --git a/vendor/github.com/rs/dnscache/.travis.yml b/vendor/github.com/rs/dnscache/.travis.yml deleted file mode 100644 index ce47932b..00000000 --- a/vendor/github.com/rs/dnscache/.travis.yml +++ /dev/null @@ -1,13 +0,0 @@ -language: go -go: - - "1.8" - - "1.9" - - "1.10" - - "1.11" - - "1.12" - - tip -matrix: - allow_failures: - - go: tip -script: - go test -v -race -cpu=1,2,4 -bench . -benchmem ./... diff --git a/vendor/github.com/rs/dnscache/LICENSE b/vendor/github.com/rs/dnscache/LICENSE deleted file mode 100644 index 71abfee3..00000000 --- a/vendor/github.com/rs/dnscache/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2018 Olivier Poitrey - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/vendor/github.com/rs/dnscache/README.md b/vendor/github.com/rs/dnscache/README.md deleted file mode 100644 index 267c6996..00000000 --- a/vendor/github.com/rs/dnscache/README.md +++ /dev/null @@ -1,78 +0,0 @@ -# DNS Lookup Cache - -[![license](http://img.shields.io/badge/license-MIT-red.svg?style=flat)](https://raw.githubusercontent.com/rs/dnscache/master/LICENSE) -[![Go Report Card](https://goreportcard.com/badge/github.com/rs/dnscache)](https://goreportcard.com/report/github.com/rs/dnscache) -[![Build Status](https://travis-ci.org/rs/dnscache.svg?branch=master)](https://travis-ci.org/rs/dnscache) -[![Coverage](http://gocover.io/_badge/github.com/rs/dnscache)](http://gocover.io/github.com/rs/dnscache) -[![godoc](http://img.shields.io/badge/godoc-reference-blue.svg?style=flat)](https://godoc.org/github.com/rs/dnscache) - -The dnscache package provides a DNS cache layer to Go's `net.Resolver`. - -# Install - -Install using the "go get" command: - -``` -go get -u github.com/rs/dnscache -``` - -# Usage - -Create a new instance and use it in place of `net.Resolver`. New names will be cached. Call the `Refresh` method at regular interval to update cached entries and cleanup unused ones. - -```go -resolver := &dnscache.Resolver{} - -// First call will cache the result -addrs, err := resolver.LookupHost(context.Background(), "example.com") - -// Subsequent calls will use the cached result -addrs, err = resolver.LookupHost(context.Background(), "example.com") - -// Call to refresh will refresh names in cache. If you pass true, it will also -// remove cached names not looked up since the last call to Refresh. It is a good idea -// to call this method on a regular interval. 
-go func() { - t := time.NewTicker(5 * time.Minute) - defer t.Stop() - for range t.C { - resolver.Refresh(true) - } -}() -``` - -If you are using an `http.Transport`, you can use this cache by specifying a `DialContext` function: - -```go -r := &dnscache.Resolver{} -t := &http.Transport{ - DialContext: func(ctx context.Context, network string, addr string) (conn net.Conn, err error) { - host, port, err := net.SplitHostPort(addr) - if err != nil { - return nil, err - } - ips, err := r.LookupHost(ctx, host) - if err != nil { - return nil, err - } - for _, ip := range ips { - var dialer net.Dialer - conn, err = dialer.DialContext(ctx, network, net.JoinHostPort(ip, port)) - if err == nil { - break - } - } - return - }, -} -``` - -In addition to the `Refresh` method, you can `RefreshWithOptions`. This method adds an option to persist resource records -on failed lookups -```go -r := &Resolver{} -options := dnscache.ResolverRefreshOptions{} -options.ClearUnused = true -options.PersistOnFailure = false -resolver.RefreshWithOptions(options) -``` diff --git a/vendor/github.com/rs/dnscache/dnscache.go b/vendor/github.com/rs/dnscache/dnscache.go deleted file mode 100644 index ddbb923f..00000000 --- a/vendor/github.com/rs/dnscache/dnscache.go +++ /dev/null @@ -1,308 +0,0 @@ -package dnscache - -import ( - "context" - "net" - "net/http/httptrace" - "sync" - "time" - - "golang.org/x/sync/singleflight" -) - -type DNSResolver interface { - LookupHost(ctx context.Context, host string) (addrs []string, err error) - LookupAddr(ctx context.Context, addr string) (names []string, err error) -} - -type Resolver struct { - // Timeout defines the maximum allowed time allowed for a lookup. - Timeout time.Duration - - // Resolver is used to perform actual DNS lookup. If nil, - // net.DefaultResolver is used instead. - Resolver DNSResolver - - once sync.Once - mu sync.RWMutex - cache map[string]*cacheEntry - - // OnCacheMiss is executed if the host or address is not included in - // the cache and the default lookup is executed. - OnCacheMiss func() -} - -type ResolverRefreshOptions struct { - ClearUnused bool - PersistOnFailure bool -} - -type cacheEntry struct { - rrs []string - err error - used bool -} - -// LookupAddr performs a reverse lookup for the given address, returning a list -// of names mapping to that address. -func (r *Resolver) LookupAddr(ctx context.Context, addr string) (names []string, err error) { - r.once.Do(r.init) - return r.lookup(ctx, "r"+addr) -} - -// LookupHost looks up the given host using the local resolver. It returns a -// slice of that host's addresses. -func (r *Resolver) LookupHost(ctx context.Context, host string) (addrs []string, err error) { - r.once.Do(r.init) - return r.lookup(ctx, "h"+host) -} - -// refreshRecords refreshes cached entries which have been used at least once since -// the last Refresh. If clearUnused is true, entries which haven't be used since the -// last Refresh are removed from the cache. 
If persistOnFailure is true, stale -// entries will not be removed on failed lookups -func (r *Resolver) refreshRecords(clearUnused bool, persistOnFailure bool) { - r.once.Do(r.init) - r.mu.RLock() - update := make([]string, 0, len(r.cache)) - del := make([]string, 0, len(r.cache)) - for key, entry := range r.cache { - if entry.used { - update = append(update, key) - } else if clearUnused { - del = append(del, key) - } - } - r.mu.RUnlock() - - if len(del) > 0 { - r.mu.Lock() - for _, key := range del { - delete(r.cache, key) - } - r.mu.Unlock() - } - - for _, key := range update { - r.update(context.Background(), key, false, persistOnFailure) - } -} - -func (r *Resolver) Refresh(clearUnused bool) { - r.refreshRecords(clearUnused, false) -} - -func (r *Resolver) RefreshWithOptions(options ResolverRefreshOptions) { - r.refreshRecords(options.ClearUnused, options.PersistOnFailure) -} - -func (r *Resolver) init() { - r.cache = make(map[string]*cacheEntry) -} - -// lookupGroup merges lookup calls together for lookups for the same host. The -// lookupGroup key is is the LookupIPAddr.host argument. -var lookupGroup singleflight.Group - -func (r *Resolver) lookup(ctx context.Context, key string) (rrs []string, err error) { - var found bool - rrs, err, found = r.load(key) - if !found { - if r.OnCacheMiss != nil { - r.OnCacheMiss() - } - rrs, err = r.update(ctx, key, true, false) - } - return -} - -func (r *Resolver) update(ctx context.Context, key string, used bool, persistOnFailure bool) (rrs []string, err error) { - c := lookupGroup.DoChan(key, r.lookupFunc(ctx, key)) - select { - case <-ctx.Done(): - err = ctx.Err() - if err == context.DeadlineExceeded { - // If DNS request timed out for some reason, force future - // request to start the DNS lookup again rather than waiting - // for the current lookup to complete. - lookupGroup.Forget(key) - } - case res := <-c: - if res.Shared { - // We had concurrent lookups, check if the cache is already updated - // by a friend. - var found bool - rrs, err, found = r.load(key) - if found { - return - } - } - err = res.Err - if err == nil { - rrs, _ = res.Val.([]string) - } - - if err != nil && persistOnFailure { - var found bool - rrs, err, found = r.load(key) - if found { - return - } - } - - r.mu.Lock() - r.storeLocked(key, rrs, used, err) - r.mu.Unlock() - } - return -} - -// lookupFunc returns lookup function for key. The type of the key is stored as -// the first char and the lookup subject is the rest of the key. -func (r *Resolver) lookupFunc(ctx context.Context, key string) func() (interface{}, error) { - if len(key) == 0 { - panic("lookupFunc with empty key") - } - - var resolver DNSResolver = defaultResolver - if r.Resolver != nil { - resolver = r.Resolver - } - - switch key[0] { - case 'h': - return func() (interface{}, error) { - ctx, cancel := r.prepareCtx(ctx) - defer cancel() - - return resolver.LookupHost(ctx, key[1:]) - } - case 'r': - return func() (interface{}, error) { - ctx, cancel := r.prepareCtx(ctx) - defer cancel() - - return resolver.LookupAddr(ctx, key[1:]) - } - default: - panic("lookupFunc invalid key type: " + key) - } -} - -func (r *Resolver) prepareCtx(origContext context.Context) (ctx context.Context, cancel context.CancelFunc) { - ctx = context.Background() - if r.Timeout > 0 { - ctx, cancel = context.WithTimeout(ctx, r.Timeout) - } else { - cancel = func() {} - } - - // If a httptrace has been attached to the given context it will be copied over to the newly created context. 
We only need to copy pointers - // to DNSStart and DNSDone hooks - if trace := httptrace.ContextClientTrace(origContext); trace != nil { - derivedTrace := &httptrace.ClientTrace{ - DNSStart: trace.DNSStart, - DNSDone: trace.DNSDone, - } - - ctx = httptrace.WithClientTrace(ctx, derivedTrace) - } - - return -} - -func (r *Resolver) load(key string) (rrs []string, err error, found bool) { - r.mu.RLock() - var entry *cacheEntry - entry, found = r.cache[key] - if !found { - r.mu.RUnlock() - return - } - rrs = entry.rrs - err = entry.err - used := entry.used - r.mu.RUnlock() - if !used { - r.mu.Lock() - entry.used = true - r.mu.Unlock() - } - return rrs, err, true -} - -func (r *Resolver) storeLocked(key string, rrs []string, used bool, err error) { - if entry, found := r.cache[key]; found { - // Update existing entry in place - entry.rrs = rrs - entry.err = err - entry.used = used - return - } - r.cache[key] = &cacheEntry{ - rrs: rrs, - err: err, - used: used, - } -} - -var defaultResolver = &defaultResolverWithTrace{ - ipVersion: "ip", -} - -// Create a new resolver that only resolves to IPv4 Addresses when looking up Hosts. -// Example: -// -// resolver := dnscache.Resolver{ -// Resolver: NewResolverOnlyV4(), -// } -func NewResolverOnlyV4() DNSResolver { - return &defaultResolverWithTrace{ - ipVersion: "ip4", - } -} - -// Create a new resolver that only resolves to IPv6 Addresses when looking up Hosts. -// Example: -// -// resolver := dnscache.Resolver{ -// Resolver: NewResolverOnlyV6(), -// } -func NewResolverOnlyV6() DNSResolver { - return &defaultResolverWithTrace{ - ipVersion: "ip6", - } -} - -// defaultResolverWithTrace calls `LookupIP` instead of `LookupHost` on `net.DefaultResolver` in order to cause invocation of the `DNSStart` -// and `DNSDone` hooks. By implementing `DNSResolver`, backward compatibility can be ensured. -type defaultResolverWithTrace struct { - ipVersion string -} - -func (d *defaultResolverWithTrace) LookupHost(ctx context.Context, host string) (addrs []string, err error) { - ipVersion := d.ipVersion - if ipVersion != "ip" && ipVersion != "ip4" && ipVersion != "ip6" { - ipVersion = "ip" - } - - // `net.Resolver#LookupHost` does not cause invocation of `net.Resolver#lookupIPAddr`, therefore the `DNSStart` and `DNSDone` tracing hooks - // built into the stdlib are never called. `LookupIP`, despite it's name, can also be used to lookup a hostname but does cause these hooks to be - // triggered. The format of the reponse is different, therefore it needs this thin wrapper converting it. - rawIPs, err := net.DefaultResolver.LookupIP(ctx, ipVersion, host) - if err != nil { - return nil, err - } - - cookedIPs := make([]string, len(rawIPs)) - - for i, v := range rawIPs { - cookedIPs[i] = v.String() - } - - return cookedIPs, nil -} - -func (d *defaultResolverWithTrace) LookupAddr(ctx context.Context, addr string) (names []string, err error) { - return net.DefaultResolver.LookupAddr(ctx, addr) -} diff --git a/vendor/golang.org/x/sync/singleflight/singleflight.go b/vendor/golang.org/x/sync/singleflight/singleflight.go deleted file mode 100644 index 40518309..00000000 --- a/vendor/golang.org/x/sync/singleflight/singleflight.go +++ /dev/null @@ -1,214 +0,0 @@ -// Copyright 2013 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Package singleflight provides a duplicate function call suppression -// mechanism. 
-package singleflight // import "golang.org/x/sync/singleflight" - -import ( - "bytes" - "errors" - "fmt" - "runtime" - "runtime/debug" - "sync" -) - -// errGoexit indicates the runtime.Goexit was called in -// the user given function. -var errGoexit = errors.New("runtime.Goexit was called") - -// A panicError is an arbitrary value recovered from a panic -// with the stack trace during the execution of given function. -type panicError struct { - value interface{} - stack []byte -} - -// Error implements error interface. -func (p *panicError) Error() string { - return fmt.Sprintf("%v\n\n%s", p.value, p.stack) -} - -func (p *panicError) Unwrap() error { - err, ok := p.value.(error) - if !ok { - return nil - } - - return err -} - -func newPanicError(v interface{}) error { - stack := debug.Stack() - - // The first line of the stack trace is of the form "goroutine N [status]:" - // but by the time the panic reaches Do the goroutine may no longer exist - // and its status will have changed. Trim out the misleading line. - if line := bytes.IndexByte(stack[:], '\n'); line >= 0 { - stack = stack[line+1:] - } - return &panicError{value: v, stack: stack} -} - -// call is an in-flight or completed singleflight.Do call -type call struct { - wg sync.WaitGroup - - // These fields are written once before the WaitGroup is done - // and are only read after the WaitGroup is done. - val interface{} - err error - - // These fields are read and written with the singleflight - // mutex held before the WaitGroup is done, and are read but - // not written after the WaitGroup is done. - dups int - chans []chan<- Result -} - -// Group represents a class of work and forms a namespace in -// which units of work can be executed with duplicate suppression. -type Group struct { - mu sync.Mutex // protects m - m map[string]*call // lazily initialized -} - -// Result holds the results of Do, so they can be passed -// on a channel. -type Result struct { - Val interface{} - Err error - Shared bool -} - -// Do executes and returns the results of the given function, making -// sure that only one execution is in-flight for a given key at a -// time. If a duplicate comes in, the duplicate caller waits for the -// original to complete and receives the same results. -// The return value shared indicates whether v was given to multiple callers. -func (g *Group) Do(key string, fn func() (interface{}, error)) (v interface{}, err error, shared bool) { - g.mu.Lock() - if g.m == nil { - g.m = make(map[string]*call) - } - if c, ok := g.m[key]; ok { - c.dups++ - g.mu.Unlock() - c.wg.Wait() - - if e, ok := c.err.(*panicError); ok { - panic(e) - } else if c.err == errGoexit { - runtime.Goexit() - } - return c.val, c.err, true - } - c := new(call) - c.wg.Add(1) - g.m[key] = c - g.mu.Unlock() - - g.doCall(c, key, fn) - return c.val, c.err, c.dups > 0 -} - -// DoChan is like Do but returns a channel that will receive the -// results when they are ready. -// -// The returned channel will not be closed. -func (g *Group) DoChan(key string, fn func() (interface{}, error)) <-chan Result { - ch := make(chan Result, 1) - g.mu.Lock() - if g.m == nil { - g.m = make(map[string]*call) - } - if c, ok := g.m[key]; ok { - c.dups++ - c.chans = append(c.chans, ch) - g.mu.Unlock() - return ch - } - c := &call{chans: []chan<- Result{ch}} - c.wg.Add(1) - g.m[key] = c - g.mu.Unlock() - - go g.doCall(c, key, fn) - - return ch -} - -// doCall handles the single call for a key. 
-func (g *Group) doCall(c *call, key string, fn func() (interface{}, error)) { - normalReturn := false - recovered := false - - // use double-defer to distinguish panic from runtime.Goexit, - // more details see https://golang.org/cl/134395 - defer func() { - // the given function invoked runtime.Goexit - if !normalReturn && !recovered { - c.err = errGoexit - } - - g.mu.Lock() - defer g.mu.Unlock() - c.wg.Done() - if g.m[key] == c { - delete(g.m, key) - } - - if e, ok := c.err.(*panicError); ok { - // In order to prevent the waiting channels from being blocked forever, - // needs to ensure that this panic cannot be recovered. - if len(c.chans) > 0 { - go panic(e) - select {} // Keep this goroutine around so that it will appear in the crash dump. - } else { - panic(e) - } - } else if c.err == errGoexit { - // Already in the process of goexit, no need to call again - } else { - // Normal return - for _, ch := range c.chans { - ch <- Result{c.val, c.err, c.dups > 0} - } - } - }() - - func() { - defer func() { - if !normalReturn { - // Ideally, we would wait to take a stack trace until we've determined - // whether this is a panic or a runtime.Goexit. - // - // Unfortunately, the only way we can distinguish the two is to see - // whether the recover stopped the goroutine from terminating, and by - // the time we know that, the part of the stack trace relevant to the - // panic has been discarded. - if r := recover(); r != nil { - c.err = newPanicError(r) - } - } - }() - - c.val, c.err = fn() - normalReturn = true - }() - - if !normalReturn { - recovered = true - } -} - -// Forget tells the singleflight to forget about a key. Future calls -// to Do for this key will call the function rather than waiting for -// an earlier call to complete. -func (g *Group) Forget(key string) { - g.mu.Lock() - delete(g.m, key) - g.mu.Unlock() -} diff --git a/vendor/modules.txt b/vendor/modules.txt index f05f1c38..23ea2b91 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -144,8 +144,8 @@ github.com/docker/go-connections/tlsconfig # github.com/docker/go-units v0.5.0 ## explicit github.com/docker/go-units -# github.com/docker/model-distribution v0.0.0-20250905083217-3f098b3d8058 -## explicit; go 1.23.0 +# github.com/docker/model-distribution v0.0.0-20250918153037-7d9fc7b72b57 +## explicit; go 1.24 github.com/docker/model-distribution/builder github.com/docker/model-distribution/distribution github.com/docker/model-distribution/internal/bundle @@ -246,7 +246,7 @@ github.com/google/go-containerregistry/pkg/v1/types github.com/google/uuid # github.com/gorilla/mux v1.8.1 ## explicit; go 1.20 -# github.com/gpustack/gguf-parser-go v0.14.1 +# github.com/gpustack/gguf-parser-go v0.22.1 ## explicit; go 1.22.0 github.com/gpustack/gguf-parser-go github.com/gpustack/gguf-parser-go/util/anyx @@ -396,9 +396,6 @@ github.com/prometheus/procfs/internal/util # github.com/rivo/uniseg v0.4.7 ## explicit; go 1.18 github.com/rivo/uniseg -# github.com/rs/dnscache v0.0.0-20230804202142-fc85eb664529 -## explicit; go 1.12 -github.com/rs/dnscache # github.com/russross/blackfriday/v2 v2.1.0 ## explicit github.com/russross/blackfriday/v2 @@ -536,7 +533,6 @@ golang.org/x/net/trace ## explicit; go 1.23.0 golang.org/x/sync/errgroup golang.org/x/sync/semaphore -golang.org/x/sync/singleflight # golang.org/x/sys v0.35.0 ## explicit; go 1.23.0 golang.org/x/sys/cpu
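---

With the gguf-parser-go bump to v0.22.1, the K-quant file types are reported as their S/M/L sub-variants, and the classifier falls back to counting tensor types only when the `general.file_type` metadata key is missing. Below is a minimal sketch of that fallback path using the exported `GetFileType` shown in the diff above; the counter values are made up purely for illustration.

```go
package main

import (
	"fmt"

	parser "github.com/gpustack/gguf-parser-go"
)

func main() {
	// Hypothetical tensor-type counts, shaped like the counter that
	// extractFileType builds when general.file_type is absent.
	cm := map[parser.GGMLType]int{
		parser.GGMLTypeQ4_K: 200,
		parser.GGMLTypeQ6_K: 40,
		parser.GGMLTypeF32:  60,
	}
	ft := parser.GetFileType(cm)
	fmt.Println(ft) // Q4_K dominant with several Q6_K tensors -> MOSTLY_Q4_K_M
}
```

Note that `extractFileType` further refines the human-readable descriptor (for example to `Q4_K_L`) when the `token_embd.weight` tensor uses a higher-precision type such as Q8_0, so the descriptor string can differ from the enum name printed here.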
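The same bump drops the vendored `rs/dnscache` resolver in favour of a small per-dialer lookup cache inside `util/httpx`. A rough sketch of wiring `DNSCacheDialContext` into an `http.Transport` follows; only the `httpx` call itself comes from the vendored code, the client construction and URL are illustrative.

```go
package main

import (
	"net"
	"net/http"
	"time"

	"github.com/gpustack/gguf-parser-go/util/httpx"
)

func main() {
	dialer := &net.Dialer{Timeout: 10 * time.Second}
	client := &http.Client{
		Transport: &http.Transport{
			// Repeated dials to the same host reuse the IPs cached by the closure.
			DialContext: httpx.DNSCacheDialContext(dialer),
		},
	}
	resp, err := client.Get("https://example.com/") // placeholder endpoint
	if err == nil {
		resp.Body.Close()
	}
}
```

Unlike the removed `dnscache.Resolver`, this cache lives only as long as the returned dial function and has no background refresh, which is why the periodic `RefreshWithOptions` goroutine from the old `init()` also disappears in this diff.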